diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/.gitignore b/trunk/3rdparty/ffmpeg-4.2-fit/.gitignore index c348b5e28..9ce1b69cd 100644 --- a/trunk/3rdparty/ffmpeg-4.2-fit/.gitignore +++ b/trunk/3rdparty/ffmpeg-4.2-fit/.gitignore @@ -14,4 +14,5 @@ ffbuild/.config libavutil/lib.version libavcodec/libavcodec.version libavutil/libavutil.version -libswresample/libswresample.version \ No newline at end of file +libswresample/libswresample.version +libavutil/ffversion.h \ No newline at end of file diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/aarch64/Makefile b/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/aarch64/Makefile new file mode 100644 index 000000000..00f93bf59 --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/aarch64/Makefile @@ -0,0 +1,60 @@ +# subsystems +OBJS-$(CONFIG_FFT) += aarch64/fft_init_aarch64.o +OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_init.o +OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o +OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o +OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o +OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o +OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o +OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o +OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o +OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o +OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_init_aarch64.o + +# decoders/encoders +OBJS-$(CONFIG_AAC_DECODER) += aarch64/aacpsdsp_init_aarch64.o \ + aarch64/sbrdsp_init_aarch64.o +OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_init.o +OBJS-$(CONFIG_OPUS_DECODER) += aarch64/opusdsp_init.o +OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o +OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_init_aarch64.o +OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_init.o +OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9dsp_init_10bpp_aarch64.o \ + aarch64/vp9dsp_init_12bpp_aarch64.o \ + aarch64/vp9dsp_init_aarch64.o + +# ARMv8 optimizations + +# subsystems +ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o + +# NEON optimizations + +# subsystems +NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o +NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o +NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o +NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o +NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o \ + aarch64/h264idct_neon.o +NEON-OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_neon.o +NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \ + aarch64/hpeldsp_neon.o +NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o +NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o \ + aarch64/simple_idct_neon.o +NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o +NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o +NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o + +# decoders/encoders +NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/aacpsdsp_neon.o +NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_neon.o +NEON-OBJS-$(CONFIG_OPUS_DECODER) += aarch64/opusdsp_neon.o +NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o +NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \ + aarch64/vp9itxfm_neon.o \ + aarch64/vp9lpf_16bpp_neon.o \ + aarch64/vp9lpf_neon.o \ + aarch64/vp9mc_16bpp_neon.o \ + aarch64/vp9mc_neon.o diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/Makefile b/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/Makefile new file mode 100644 index 000000000..194135daf --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/Makefile @@ -0,0 +1,199 @@ +OBJS += x86/constants.o \ + +# subsystems +OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o +OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o +OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o +OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o +OBJS-$(CONFIG_DCT) += x86/dct_init.o +OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \ + x86/dirac_dwt_init.o +OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o +OBJS-$(CONFIG_FFT) += x86/fft_init.o +OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o +OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert_init.o +OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o +OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o +OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o +OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o +OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o +OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o +OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp_init.o +OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp_init.o +OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp_init.o +OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o +OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_init.o +OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o +OBJS-$(CONFIG_LPC) += x86/lpc.o +OBJS-$(CONFIG_MDCT15) += x86/mdct15_init.o +OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o +OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o +OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o \ + x86/mpegvideodsp.o +OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o \ + x86/mpegvideoencdsp_init.o +OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp_init.o +OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp_init.o +OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp_init.o +OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_init.o +OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o +OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o +OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp_init.o +OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o + +# decoders/encoders +OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp_init.o \ + x86/sbrdsp_init.o +OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp_init.o +OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp_init.o +OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o +OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o +OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o +OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o +OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o +OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o +OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp_init.o +OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp_init.o +OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_init.o +OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o +OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o +OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp_init.o +OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o +OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_init.o +OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o +OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o +OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o +OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o +OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o +OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o +OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o +OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o +OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp_init.o +OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp_init.o +OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp_init.o +OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o +OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc_init.o +OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o +OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3_init.o +OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o +OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o \ + x86/vp9dsp_init_10bpp.o \ + x86/vp9dsp_init_12bpp.o \ + x86/vp9dsp_init_16bpp.o +OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o + + +# GCC inline assembly optimizations +# subsystems +MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o +MMX-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_mmx.o + +# decoders/encoders +MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o +MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o + +# subsystems +X86ASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o \ + x86/ac3dsp_downmix.o +X86ASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o +X86ASM-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o +X86ASM-OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp.o +X86ASM-OBJS-$(CONFIG_DCT) += x86/dct32.o +X86ASM-OBJS-$(CONFIG_FFT) += x86/fft.o +X86ASM-OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert.o +X86ASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o +X86ASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \ + x86/h264_chromamc_10bit.o +X86ASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \ + x86/h264_deblock_10bit.o \ + x86/h264_idct.o \ + x86/h264_idct_10bit.o \ + x86/h264_weight.o \ + x86/h264_weight_10bit.o +X86ASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \ + x86/h264_intrapred_10bit.o +X86ASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \ + x86/h264_qpel_10bit.o \ + x86/fpel.o \ + x86/qpel.o +X86ASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \ + x86/hpeldsp.o +X86ASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o +X86ASM-OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp.o +X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o +X86ASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o +X86ASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o +X86ASM-OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp.o +X86ASM-OBJS-$(CONFIG_MDCT15) += x86/mdct15.o +X86ASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o +X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o +X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o +X86ASM-OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp.o +X86ASM-OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_search.o +X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp.o +X86ASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \ + x86/fpel.o \ + x86/qpel.o +X86ASM-OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp.o +X86ASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_loopfilter.o \ + x86/vc1dsp_mc.o +X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct10.o \ + x86/simple_idct.o +X86ASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o +X86ASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o +X86ASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \ + x86/vp8dsp_loopfilter.o + +# decoders/encoders +X86ASM-OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp.o \ + x86/sbrdsp.o +X86ASM-OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp.o +X86ASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o +X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o +X86ASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o +X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o +X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o +X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o +X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \ + x86/dirac_dwt.o +X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o +X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o +X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o +ifdef CONFIG_GPL +X86ASM-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flac_dsp_gpl.o +endif +X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o \ + x86/hevc_deblock.o \ + x86/hevc_idct.o \ + x86/hevc_mc.o \ + x86/hevc_sao.o \ + x86/hevc_sao_10bit.o +X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o +X86ASM-OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp.o +X86ASM-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o +X86ASM-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct.o +X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o +X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o +X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o +X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o +X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o +X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o +X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o +X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o +X86ASM-OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp.o +X86ASM-OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp.o +X86ASM-OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp.o +X86ASM-OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc.o +X86ASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o +X86ASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o +X86ASM-OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3.o +X86ASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o +X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \ + x86/vp9intrapred_16bpp.o \ + x86/vp9itxfm.o \ + x86/vp9itxfm_16bpp.o \ + x86/vp9lpf.o \ + x86/vp9lpf_16bpp.o \ + x86/vp9mc.o \ + x86/vp9mc_16bpp.o +X86ASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/aacencdsp.asm b/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/aacencdsp.asm new file mode 100644 index 000000000..97af571ec --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/aacencdsp.asm @@ -0,0 +1,86 @@ +;****************************************************************************** +;* SIMD optimized AAC encoder DSP functions +;* +;* Copyright (C) 2016 Rostislav Pehlivanov +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +float_abs_mask: times 4 dd 0x7fffffff + +SECTION .text + +;******************************************************************* +;void ff_abs_pow34(float *out, const float *in, const int size); +;******************************************************************* +INIT_XMM sse +cglobal abs_pow34, 3, 3, 3, out, in, size + mova m2, [float_abs_mask] + shl sizeq, 2 + add inq, sizeq + add outq, sizeq + neg sizeq +.loop: + andps m0, m2, [inq+sizeq] + sqrtps m1, m0 + mulps m0, m1 + sqrtps m0, m0 + mova [outq+sizeq], m0 + add sizeq, mmsize + jl .loop + RET + +;******************************************************************* +;void ff_aac_quantize_bands(int *out, const float *in, const float *scaled, +; int size, int is_signed, int maxval, const float Q34, +; const float rounding) +;******************************************************************* +INIT_XMM sse2 +cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding +%if UNIX64 == 0 + movss m0, Q34m + movss m1, roundingm + cvtsi2ss m3, dword maxvalm +%else + cvtsi2ss m3, maxvald +%endif + shufps m0, m0, 0 + shufps m1, m1, 0 + shufps m3, m3, 0 + shl is_signedd, 31 + movd m4, is_signedd + shufps m4, m4, 0 + shl sized, 2 + add inq, sizeq + add outq, sizeq + add scaledq, sizeq + neg sizeq +.loop: + mulps m2, m0, [scaledq+sizeq] + addps m2, m1 + minps m2, m3 + andps m5, m4, [inq+sizeq] + orps m2, m5 + cvttps2dq m2, m2 + mova [outq+sizeq], m2 + add sizeq, mmsize + jl .loop + RET diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/aacpsdsp.asm b/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/aacpsdsp.asm new file mode 100644 index 000000000..4acd087c8 --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/aacpsdsp.asm @@ -0,0 +1,487 @@ +;****************************************************************************** +;* SIMD optimized MPEG-4 Parametric Stereo decoding functions +;* +;* Copyright (C) 2015 James Almer +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000 + +SECTION .text + +;************************************************************************* +;void ff_ps_add_squares_(float *dst, const float (*src)[2], int n); +;************************************************************************* +%macro PS_ADD_SQUARES 1 +cglobal ps_add_squares, 3, 3, %1, dst, src, n + shl nd, 3 + add srcq, nq + neg nq + +align 16 +.loop: + movaps m0, [srcq+nq] + movaps m1, [srcq+nq+mmsize] + mulps m0, m0 + mulps m1, m1 + HADDPS m0, m1, m2 + addps m0, [dstq] + movaps [dstq], m0 + add dstq, mmsize + add nq, mmsize*2 + jl .loop + REP_RET +%endmacro + +INIT_XMM sse +PS_ADD_SQUARES 2 +INIT_XMM sse3 +PS_ADD_SQUARES 3 + +;******************************************************************* +;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2], +; float *src1, int n); +;******************************************************************* +INIT_XMM sse +cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n + shl nd, 3 + add src1q, nq + add dstq, nq + neg nq + +align 16 +.loop: + movu m0, [src1q+nq] + movu m1, [src1q+nq+mmsize] + mova m2, [src2q] + mova m3, m2 + unpcklps m2, m2 + unpckhps m3, m3 + mulps m0, m2 + mulps m1, m3 + mova [dstq+nq], m0 + mova [dstq+nq+mmsize], m1 + add src2q, mmsize + add nq, mmsize*2 + jl .loop + REP_RET + +;*********************************************************************** +;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2], +; float h[2][4], float h_step[2][4], +; int len); +;*********************************************************************** +INIT_XMM sse3 +cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n + movaps m0, [hq] + movaps m1, [h_stepq] + unpcklps m4, m0, m0 + unpckhps m0, m0 + unpcklps m5, m1, m1 + unpckhps m1, m1 + shl nd, 3 + add lq, nq + add rq, nq + neg nq + +align 16 +.loop: + addps m4, m5 + addps m0, m1 + movddup m2, [lq+nq] + movddup m3, [rq+nq] + mulps m2, m4 + mulps m3, m0 + addps m2, m3 + movsd [lq+nq], m2 + movhps [rq+nq], m2 + add nq, 8 + jl .loop + REP_RET + +;*************************************************************************** +;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2], +; float h[2][4], float h_step[2][4], +; int len); +;*************************************************************************** +INIT_XMM sse3 +cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n + movaps m0, [hq] + movaps m1, [hq+mmsize] +%if ARCH_X86_64 + movaps m8, [h_stepq] + movaps m9, [h_stepq+mmsize] + %define H_STEP0 m8 + %define H_STEP1 m9 +%else + %define H_STEP0 [h_stepq] + %define H_STEP1 [h_stepq+mmsize] +%endif + shl nd, 3 + add lq, nq + add rq, nq + neg nq + +align 16 +.loop: + addps m0, H_STEP0 + addps m1, H_STEP1 + movddup m2, [lq+nq] + movddup m3, [rq+nq] + shufps m4, m2, m2, q2301 + shufps m5, m3, m3, q2301 + unpcklps m6, m0, m0 + unpckhps m7, m0, m0 + mulps m2, m6 + mulps m3, m7 + unpcklps m6, m1, m1 + unpckhps m7, m1, m1 + mulps m4, m6 + mulps m5, m7 + addps m2, m3 + addsubps m2, m4 + addsubps m2, m5 + movsd [lq+nq], m2 + movhps [rq+nq], m2 + add nq, 8 + jl .loop + REP_RET + +;********************************************************** +;void ps_hybrid_analysis_ileave_sse(float out[2][38][64], +; float (*in)[32][2], +; int i, int len) +;********************************************************** +INIT_XMM sse +cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp + movsxdifnidn iq, id + mov lend, 32 << 3 + lea inq, [inq+iq*4] + mov tmpd, id + shl tmpd, 8 + add outq, tmpq + mov tmpd, 64 + sub tmpd, id + mov id, tmpd + + test id, 1 + jne .loop4 + test id, 2 + jne .loop8 + +align 16 +.loop16: + mov in0q, inq + mov in1q, 38*64*4 + add in1q, in0q + mov tmpd, lend + +.inner_loop16: + movaps m0, [in0q] + movaps m1, [in1q] + movaps m2, [in0q+lenq] + movaps m3, [in1q+lenq] + TRANSPOSE4x4PS 0, 1, 2, 3, 4 + movaps [outq], m0 + movaps [outq+lenq], m1 + movaps [outq+lenq*2], m2 + movaps [outq+3*32*2*4], m3 + lea in0q, [in0q+lenq*2] + lea in1q, [in1q+lenq*2] + add outq, mmsize + sub tmpd, mmsize + jg .inner_loop16 + add inq, 16 + add outq, 3*32*2*4 + sub id, 4 + jg .loop16 + RET + +align 16 +.loop8: + mov in0q, inq + mov in1q, 38*64*4 + add in1q, in0q + mov tmpd, lend + +.inner_loop8: + movlps m0, [in0q] + movlps m1, [in1q] + movhps m0, [in0q+lenq] + movhps m1, [in1q+lenq] + SBUTTERFLYPS 0, 1, 2 + SBUTTERFLYPD 0, 1, 2 + movaps [outq], m0 + movaps [outq+lenq], m1 + lea in0q, [in0q+lenq*2] + lea in1q, [in1q+lenq*2] + add outq, mmsize + sub tmpd, mmsize + jg .inner_loop8 + add inq, 8 + add outq, lenq + sub id, 2 + jg .loop16 + RET + +align 16 +.loop4: + mov in0q, inq + mov in1q, 38*64*4 + add in1q, in0q + mov tmpd, lend + +.inner_loop4: + movss m0, [in0q] + movss m1, [in1q] + movss m2, [in0q+lenq] + movss m3, [in1q+lenq] + movlhps m0, m1 + movlhps m2, m3 + shufps m0, m2, q2020 + movaps [outq], m0 + lea in0q, [in0q+lenq*2] + lea in1q, [in1q+lenq*2] + add outq, mmsize + sub tmpd, mmsize + jg .inner_loop4 + add inq, 4 + sub id, 1 + test id, 2 + jne .loop8 + cmp id, 4 + jge .loop16 + RET + +;*********************************************************** +;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64], +; float (*in)[32][2], +; int i, int len) +;*********************************************************** +%macro HYBRID_SYNTHESIS_DEINT 0 +cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp +%if cpuflag(sse4) +%define MOVH movsd +%else +%define MOVH movlps +%endif + movsxdifnidn iq, id + mov lend, 32 << 3 + lea outq, [outq+iq*4] + mov tmpd, id + shl tmpd, 8 + add inq, tmpq + mov tmpd, 64 + sub tmpd, id + mov id, tmpd + + test id, 1 + jne .loop4 + test id, 2 + jne .loop8 + +align 16 +.loop16: + mov out0q, outq + mov out1q, 38*64*4 + add out1q, out0q + mov tmpd, lend + +.inner_loop16: + movaps m0, [inq] + movaps m1, [inq+lenq] + movaps m2, [inq+lenq*2] + movaps m3, [inq+3*32*2*4] + TRANSPOSE4x4PS 0, 1, 2, 3, 4 + movaps [out0q], m0 + movaps [out1q], m1 + movaps [out0q+lenq], m2 + movaps [out1q+lenq], m3 + lea out0q, [out0q+lenq*2] + lea out1q, [out1q+lenq*2] + add inq, mmsize + sub tmpd, mmsize + jg .inner_loop16 + add outq, 16 + add inq, 3*32*2*4 + sub id, 4 + jg .loop16 + RET + +align 16 +.loop8: + mov out0q, outq + mov out1q, 38*64*4 + add out1q, out0q + mov tmpd, lend + +.inner_loop8: + movaps m0, [inq] + movaps m1, [inq+lenq] + SBUTTERFLYPS 0, 1, 2 + SBUTTERFLYPD 0, 1, 2 + MOVH [out0q], m0 + MOVH [out1q], m1 + movhps [out0q+lenq], m0 + movhps [out1q+lenq], m1 + lea out0q, [out0q+lenq*2] + lea out1q, [out1q+lenq*2] + add inq, mmsize + sub tmpd, mmsize + jg .inner_loop8 + add outq, 8 + add inq, lenq + sub id, 2 + jg .loop16 + RET + +align 16 +.loop4: + mov out0q, outq + mov out1q, 38*64*4 + add out1q, out0q + mov tmpd, lend + +.inner_loop4: + movaps m0, [inq] + movss [out0q], m0 +%if cpuflag(sse4) + extractps [out1q], m0, 1 + extractps [out0q+lenq], m0, 2 + extractps [out1q+lenq], m0, 3 +%else + movhlps m1, m0 + movss [out0q+lenq], m1 + shufps m0, m0, 0xb1 + movss [out1q], m0 + movhlps m1, m0 + movss [out1q+lenq], m1 +%endif + lea out0q, [out0q+lenq*2] + lea out1q, [out1q+lenq*2] + add inq, mmsize + sub tmpd, mmsize + jg .inner_loop4 + add outq, 4 + sub id, 1 + test id, 2 + jne .loop8 + cmp id, 4 + jge .loop16 + RET +%endmacro + +INIT_XMM sse +HYBRID_SYNTHESIS_DEINT +INIT_XMM sse4 +HYBRID_SYNTHESIS_DEINT + +;******************************************************************* +;void ff_ps_hybrid_analysis_(float (*out)[2], float (*in)[2], +; const float (*filter)[8][2], +; ptrdiff_t stride, int n); +;******************************************************************* +%macro PS_HYBRID_ANALYSIS_LOOP 3 + movu %1, [inq+mmsize*%3] + movu m1, [inq+mmsize*(5-%3)+8] +%if cpuflag(sse3) + pshufd %2, %1, q2301 + pshufd m4, m1, q0123 + pshufd m1, m1, q1032 + pshufd m2, [filterq+nq+mmsize*%3], q2301 + addsubps %2, m4 + addsubps %1, m1 +%else + mova m2, [filterq+nq+mmsize*%3] + mova %2, %1 + mova m4, m1 + shufps %2, %2, q2301 + shufps m4, m4, q0123 + shufps m1, m1, q1032 + shufps m2, m2, q2301 + xorps m4, m7 + xorps m1, m7 + subps %2, m4 + subps %1, m1 +%endif + mulps %2, m2 + mulps %1, m2 +%if %3 + addps m3, %2 + addps m0, %1 +%endif +%endmacro + +%macro PS_HYBRID_ANALYSIS 0 +cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n +%if cpuflag(sse3) +%define MOVH movsd +%else +%define MOVH movlps +%endif + shl strideq, 3 + shl nd, 6 + add filterq, nq + neg nq + mova m7, [ps_p1m1p1m1] + +align 16 +.loop: + PS_HYBRID_ANALYSIS_LOOP m0, m3, 0 + PS_HYBRID_ANALYSIS_LOOP m5, m6, 1 + PS_HYBRID_ANALYSIS_LOOP m5, m6, 2 + +%if cpuflag(sse3) + pshufd m3, m3, q2301 + xorps m0, m7 + hsubps m3, m0 + pshufd m1, m3, q0020 + pshufd m3, m3, q0031 + addps m1, m3 + movsd m2, [inq+6*8] +%else + mova m1, m3 + mova m2, m0 + shufps m1, m1, q2301 + shufps m2, m2, q2301 + subps m1, m3 + addps m2, m0 + unpcklps m3, m1, m2 + unpckhps m1, m2 + addps m1, m3 + movu m2, [inq+6*8] ; faster than movlps and no risk of overread +%endif + movss m3, [filterq+nq+8*6] + SPLATD m3 + mulps m2, m3 + addps m1, m2 + MOVH [outq], m1 + add outq, strideq + add nq, 64 + jl .loop + REP_RET +%endmacro + +INIT_XMM sse +PS_HYBRID_ANALYSIS +INIT_XMM sse3 +PS_HYBRID_ANALYSIS diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/celt_pvq_search.asm b/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/celt_pvq_search.asm new file mode 100644 index 000000000..5c1e6d617 --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/celt_pvq_search.asm @@ -0,0 +1,385 @@ +;****************************************************************************** +;* SIMD optimized Opus encoder DSP function +;* +;* Copyright (C) 2017 Ivan Kalvachev +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "config.asm" +%include "libavutil/x86/x86util.asm" + +%ifdef __NASM_VER__ +%use "smartalign" +ALIGNMODE p6 +%endif + +SECTION_RODATA 64 + +const_float_abs_mask: times 8 dd 0x7fffffff +const_align_abs_edge: times 8 dd 0 + +const_float_0_5: times 8 dd 0.5 +const_float_1: times 8 dd 1.0 +const_float_sign_mask: times 8 dd 0x80000000 + +const_int32_offsets: + %rep 8 + dd $-const_int32_offsets + %endrep +SECTION .text + +; +; Setup High Register to be used +; for holding memory constants +; +; %1 - the register to be used, assmues it is >= mm8 +; %2 - name of the constant. +; +; Subsequent opcodes are going to use the constant in the form +; "addps m0, mm_const_name" and it would be turned into: +; "addps m0, [const_name]" on 32 bit arch or +; "addps m0, m8" on 64 bit arch +%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name +%if num_mmregs > 8 + %define mm_%3 %2 + %{1} %2, [%3] ; movaps m8, [const_name] +%else + %define mm_%3 [%3] +%endif +%endmacro + +; +; Set Position Independent Code +; Base address of a constant +; %1 - the register to be used, if PIC is set +; %2 - name of the constant. +; +; Subsequent opcode are going to use the base address in the form +; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into +; "movaps m0, [r5 + r4]" if PIC is enabled +; "movaps m0, [constant_name + r4]" if texrel are used +%macro SET_PIC_BASE 3; reg, const_label +%ifdef PIC + %{1} %2, [%3] ; lea r5, [rip+const] + %define pic_base_%3 %2 +%else + %define pic_base_%3 %3 +%endif +%endmacro + +%macro PULSES_SEARCH 1 +; m6 Syy_norm +; m7 Sxy_norm + addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2 + pxor m1, m1 ; max_idx + xorps m3, m3 ; p_max + xor r4d, r4d +align 16 +%%distortion_search: + movd xm2, dword r4d ; movd zero extends +%ifidn %1,add + movaps m4, [tmpY + r4] ; y[i] + movaps m5, [tmpX + r4] ; X[i] + + %if USE_APPROXIMATION == 1 + xorps m0, m0 + cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0) + %endif + + addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm + addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm + + %if USE_APPROXIMATION == 1 + andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding. + %endif + +%else + movaps m5, [tmpY + r4] ; m5 = y[i] + + xorps m0, m0 ; m0 = 0; + cmpps m0, m0, m5, 1 ; m0 = (0 p_max) + maxps m3, m5 ; m3=max(p_max,p) + ; maxps here is faster than blendvps, despite blend having lower latency. + + pand m2, m0 ; This version seems faster than sse41 pblendvb + pmaxsw m1, m2 ; SSE2 signed word, so it would work for N < 32768/4 + + add r4d, mmsize + cmp r4d, Nd + jb %%distortion_search + + por m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop) + movdqa m4, m1 ; needed for the aligned y[max_idx]+=1; processing + +%if mmsize >= 32 +; Merge parallel maximums round 8 (4 vs 4) + + vextractf128 xm5, ym3, 1 ; xmm5 = ymm3[1x128] = ymm3[255..128b] + cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] ) + + vextracti128 xm2, ym1, 1 ; xmm2 = ymm1[1x128] = ymm1[255..128b] + BLENDVPS xm3, xm5, xm0 ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128] + PBLENDVB xm1, xm2, xm0 ; p = m0 ? p[1x128] : p[0x128] +%endif + +; Merge parallel maximums round 4 (2 vs 2) + ; m3=p[3210] + movhlps xm5, xm3 ; m5=p[xx32] + cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] ) + + pshufd xm2, xm1, q3232 + BLENDVPS xm3, xm5, xm0 ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0] + PBLENDVB xm1, xm2, xm0 ; p = m0 ? p[3,2] : p[1,0] + +; Merge parallel maximums final round (1 vs 1) + shufps xm0, xm3, xm3, q1111 ; m0 = m3[1] = p[1] + cmpss xm0, xm3, 5 ; m0 = !(m0 >= m3) = !( p[1] >= p[0] ) + + pshufd xm2, xm1, q1111 + PBLENDVB xm1, xm2, xm0 + + movd dword r4d, xm1 ; zero extends to the rest of r4q + + VBROADCASTSS m3, [tmpX + r4] + %{1}ps m7, m3 ; Sxy += X[max_idx] + + VBROADCASTSS m5, [tmpY + r4] + %{1}ps m6, m5 ; Syy += Y[max_idx] + + ; We have to update a single element in Y[i] + ; However writing 4 bytes and then doing 16 byte load in the inner loop + ; could cause a stall due to breaking write forwarding. + VPBROADCASTD m1, xm1 + pcmpeqd m1, m1, m4 ; exactly 1 element matches max_idx and this finds it + + and r4d, ~(mmsize-1) ; align address down, so the value pointed by max_idx is inside a mmsize load + movaps m5, [tmpY + r4] ; m5 = Y[y3...ym...y0] + andps m1, mm_const_float_1 ; m1 = [ 0...1.0...0] + %{1}ps m5, m1 ; m5 = Y[y3...ym...y0] +/- [0...1.0...0] + movaps [tmpY + r4], m5 ; Y[max_idx] +-= 1.0; +%endmacro + +; +; We need one more register for +; PIC relative addressing. Use this +; to count it in cglobal +; +%ifdef PIC + %define num_pic_regs 1 +%else + %define num_pic_regs 0 +%endif + +; +; Pyramid Vector Quantization Search implementation +; +; float * inX - Unaligned (SIMD) access, it will be overread, +; but extra data is masked away. +; int32 * outY - Should be aligned and padded buffer. +; It is used as temp buffer. +; uint32 K - Number of pulses to have after quantizations. +; uint32 N - Number of vector elements. Must be 0 < N < 256 +; +%macro PVQ_FAST_SEARCH 1 +cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N +%define tmpX rsp +%define tmpY outYq + + movaps m0, [const_float_abs_mask] + shl Nd, 2 ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode. + mov r4d, Nd + + neg r4d + and r4d, mmsize-1 + + SET_PIC_BASE lea, r5, const_align_abs_edge ; rip+const + movups m2, [pic_base_const_align_abs_edge + r4 - mmsize] + + add Nd, r4d ; N = align(N, mmsize) + + lea r4d, [Nd - mmsize] ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0. + movups m1, [inXq + r4] + andps m1, m2 + movaps [tmpX + r4], m1 ; Sx = abs( X[N-1] ) + +align 16 +%%loop_abs_sum: + sub r4d, mmsize + jc %%end_loop_abs_sum + + movups m2, [inXq + r4] + andps m2, m0 + + movaps [tmpX + r4], m2 ; tmpX[i]=abs(X[i]) + addps m1, m2 ; Sx += abs(X[i]) + jmp %%loop_abs_sum + +align 16 +%%end_loop_abs_sum: + + HSUMPS m1, m2 ; m1 = Sx + + xorps m0, m0 + comiss xm0, xm1 ; + jz %%zero_input ; if (Sx==0) goto zero_input + + cvtsi2ss xm0, dword Kd ; m0 = K +%if USE_APPROXIMATION == 1 + rcpss xm1, xm1 ; m1 = approx(1/Sx) + mulss xm0, xm1 ; m0 = K*(1/Sx) +%else + divss xm0, xm1 ; b = K/Sx + ; b = K/max_x +%endif + + VBROADCASTSS m0, xm0 + + lea r4d, [Nd - mmsize] + pxor m5, m5 ; Sy ( Sum of abs( y[i]) ) + xorps m6, m6 ; Syy ( Sum of y[i]*y[i] ) + xorps m7, m7 ; Sxy ( Sum of X[i]*y[i] ) +align 16 +%%loop_guess: + movaps m1, [tmpX + r4] ; m1 = X[i] + mulps m2, m0, m1 ; m2 = res*X[i] + cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] ) + paddd m5, m2 ; Sy += yt + cvtdq2ps m2, m2 ; yt = (float)yt + mulps m1, m2 ; m1 = X[i]*yt + movaps [tmpY + r4], m2 ; y[i] = m2 + addps m7, m1 ; Sxy += m1; + mulps m2, m2 ; m2 = yt*yt + addps m6, m2 ; Syy += m2 + + sub r4d, mmsize + jnc %%loop_guess + + HSUMPS m6, m1 ; Syy_norm + HADDD m5, m4 ; pulses + + movd dword r4d, xm5 ; zero extends to the rest of r4q + + sub Kd, r4d ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode. + jz %%finish ; K - pulses == 0 + + SET_HI_REG_MM_CONSTANT movaps, m8, const_float_0_5 + SET_HI_REG_MM_CONSTANT movaps, m9, const_float_1 + SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets + ; Use Syy/2 in distortion parameter calculations. + ; Saves pre and post-caclulation to correct Y[] values. + ; Same precision, since float mantisa is normalized. + ; The SQRT approximation does differ. + HSUMPS m7, m0 ; Sxy_norm + mulps m6, mm_const_float_0_5 + + jc %%remove_pulses_loop ; K - pulses < 0 + +align 16 ; K - pulses > 0 +%%add_pulses_loop: + + PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm + + sub Kd, 1 + jnz %%add_pulses_loop + + addps m6, m6 ; Syy*=2 + + jmp %%finish + +align 16 +%%remove_pulses_loop: + + PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm + + add Kd, 1 + jnz %%remove_pulses_loop + + addps m6, m6 ; Syy*=2 + +align 16 +%%finish: + lea r4d, [Nd - mmsize] + movaps m2, [const_float_sign_mask] + +align 16 +%%restore_sign_loop: + movaps m0, [tmpY + r4] ; m0 = Y[i] + movups m1, [inXq + r4] ; m1 = X[i] + andps m1, m2 ; m1 = sign(X[i]) + orps m0, m1 ; m0 = Y[i]*sign + cvtps2dq m3, m0 ; m3 = (int)m0 + movaps [outYq + r4], m3 + + sub r4d, mmsize + jnc %%restore_sign_loop +%%return: + +%if ARCH_X86_64 == 0 ; sbrdsp + movss r0m, xm6 ; return (float)Syy_norm + fld dword r0m +%else + movaps m0, m6 ; return (float)Syy_norm +%endif + + RET + +align 16 +%%zero_input: + lea r4d, [Nd - mmsize] + xorps m0, m0 +%%zero_loop: + movaps [outYq + r4], m0 + sub r4d, mmsize + jnc %%zero_loop + + movaps m6, [const_float_1] + jmp %%return +%endmacro + +; if 1, use a float op that give half precision but execute for around 3 cycles. +; On Skylake & Ryzen the division is much faster (around 11c/3), +; that makes the full precision code about 2% slower. +; Opus also does use rsqrt approximation in their intrinsics code. +%define USE_APPROXIMATION 1 + +INIT_XMM sse2 +PVQ_FAST_SEARCH _approx + +INIT_XMM sse4 +PVQ_FAST_SEARCH _approx + +%define USE_APPROXIMATION 0 + +INIT_XMM avx +PVQ_FAST_SEARCH _exact diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/fft.asm b/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/fft.asm new file mode 100644 index 000000000..a671e8f48 --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/fft.asm @@ -0,0 +1,1085 @@ +;****************************************************************************** +;* FFT transform with SSE/3DNow optimizations +;* Copyright (c) 2008 Loren Merritt +;* Copyright (c) 2011 Vitor Sessak +;* +;* This algorithm (though not any of the implementation details) is +;* based on libdjbfft by D. J. Bernstein. +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +; These functions are not individually interchangeable with the C versions. +; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results +; in blocks as conventient to the vector size. +; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) + +%include "libavutil/x86/x86util.asm" + +%if ARCH_X86_64 +%define pointer resq +%else +%define pointer resd +%endif + +struc FFTContext + .nbits: resd 1 + .reverse: resd 1 + .revtab: pointer 1 + .tmpbuf: pointer 1 + .mdctsize: resd 1 + .mdctbits: resd 1 + .tcos: pointer 1 + .tsin: pointer 1 + .fftperm: pointer 1 + .fftcalc: pointer 1 + .imdctcalc:pointer 1 + .imdcthalf:pointer 1 +endstruc + +SECTION_RODATA 32 + +%define M_SQRT1_2 0.70710678118654752440 +%define M_COS_PI_1_8 0.923879532511287 +%define M_COS_PI_3_8 0.38268343236509 + +ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8 +ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8 + +ps_root2: times 8 dd M_SQRT1_2 +ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 +ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0 + +perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 +perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 +ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 +ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 +ps_m1p1: dd 1<<31, 0 + +cextern ps_neg + +%assign i 16 +%rep 14 +cextern cos_ %+ i +%assign i i<<1 +%endrep + +%if ARCH_X86_64 + %define pointer dq +%else + %define pointer dd +%endif + +%macro IF0 1+ +%endmacro +%macro IF1 1+ + %1 +%endmacro + +SECTION .text + +%macro T2_3DNOW 4 ; z0, z1, mem0, mem1 + mova %1, %3 + mova %2, %1 + pfadd %1, %4 + pfsub %2, %4 +%endmacro + +%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1 + mova %5, %3 + pfsub %3, %4 + pfadd %5, %4 ; {t6,t5} + pxor %3, [ps_m1p1] ; {t8,t7} + mova %6, %1 + movd [r0+12], %3 + punpckhdq %3, [r0+8] + pfadd %1, %5 ; {r0,i0} + pfsub %6, %5 ; {r2,i2} + mova %4, %2 + pfadd %2, %3 ; {r1,i1} + pfsub %4, %3 ; {r3,i3} + SWAP %3, %6 +%endmacro + +; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6} +; %2 = {r1,i1,r3,i3,r5,i5,r7,i7} +; %3, %4, %5 tmp +; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3} +; %2 = {r4,r5,r6,r7,i4,i5,i6,i7} +%macro T8_AVX 5 + vsubps %5, %1, %2 ; v = %1 - %2 + vaddps %3, %1, %2 ; w = %1 + %2 + vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1 + vpermilps %2, %2, [perm1] + vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6} + vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5} + vsubps %4, %5, %1 ; s = r - q + vaddps %1, %5, %1 ; u = r + q + vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8} + vshufps %5, %4, %1, 0xbb + vshufps %3, %4, %1, 0xee + vperm2f128 %3, %3, %5, 0x13 + vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1} + vshufps %2, %1, %4, 0xdd + vshufps %1, %1, %4, 0x88 + vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4} + vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7} + vsubps %5, %1, %3 + vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8} + vsubps %2, %4, %1 ; %2 = v - w + vaddps %1, %4, %1 ; %1 = v + w +%endmacro + +; In SSE mode do one fft4 transforms +; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3} +; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} +; +; In AVX mode do two fft4 transforms +; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7} +; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7} +%macro T4_SSE 3 + subps %3, %1, %2 ; {t3,t4,-t8,t7} + addps %1, %1, %2 ; {t1,t2,t6,t5} + xorps %3, %3, [ps_p1p1m1p1] + shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8} + shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4} + subps %3, %1, %2 ; {r2,i2,r3,i3} + addps %1, %1, %2 ; {r0,i0,r1,i1} + shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3} + shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3} +%endmacro + +; In SSE mode do one FFT8 +; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7} +; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %1={r4,r5,r6,r7} %2={i4,i5,i6,i7} +; +; In AVX mode do two FFT8 +; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11} +; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15} +; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11} +; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15} +%macro T8_SSE 6 + addps %6, %3, %4 ; {t1,t2,t3,t4} + subps %3, %3, %4 ; {r5,i5,r7,i7} + shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7} + mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} + mulps %4, %4, [ps_root2] + addps %3, %3, %4 ; {t8,t7,ta,t9} + shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta} + shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8} + subps %3, %6, %4 ; {t6,t5,tc,tb} + addps %6, %6, %4 ; {t1,t2,t9,ta} + shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc} + shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb} + subps %3, %1, %6 ; {r4,r5,r6,r7} + addps %1, %1, %6 ; {r0,r1,r2,r3} + subps %4, %2, %5 ; {i4,i5,i6,i7} + addps %2, %2, %5 ; {i0,i1,i2,i3} +%endmacro + +%macro INTERL 5 +%if cpuflag(avx) + vunpckhps %3, %2, %1 + vunpcklps %2, %2, %1 + vextractf128 %4(%5), %2, 0 + vextractf128 %4 %+ H(%5), %3, 0 + vextractf128 %4(%5 + 1), %2, 1 + vextractf128 %4 %+ H(%5 + 1), %3, 1 +%elif cpuflag(sse) || cpuflag(3dnow) + mova %3, %2 + unpcklps %2, %1 + unpckhps %3, %1 + mova %4(%5), %2 + mova %4(%5+1), %3 +%endif +%endmacro + +; scheduled for cpu-bound sizes +%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim +IF%1 mova m4, Z(4) +IF%1 mova m5, Z(5) + mova m0, %2 ; wre + mova m1, %3 ; wim + mulps m2, m4, m0 ; r2*wre +IF%1 mova m6, Z2(6) + mulps m3, m5, m1 ; i2*wim +IF%1 mova m7, Z2(7) + mulps m4, m4, m1 ; r2*wim + mulps m5, m5, m0 ; i2*wre + addps m2, m2, m3 ; r2*wre + i2*wim + mulps m3, m1, m7 ; i3*wim + subps m5, m5, m4 ; i2*wre - r2*wim + mulps m1, m1, m6 ; r3*wim + mulps m4, m0, m6 ; r3*wre + mulps m0, m0, m7 ; i3*wre + subps m4, m4, m3 ; r3*wre - i3*wim + mova m3, Z(0) + addps m0, m0, m1 ; i3*wre + r3*wim + subps m1, m4, m2 ; t3 + addps m4, m4, m2 ; t5 + subps m3, m3, m4 ; r2 + addps m4, m4, Z(0) ; r0 + mova m6, Z(2) + mova Z(4), m3 + mova Z(0), m4 + subps m3, m5, m0 ; t4 + subps m4, m6, m3 ; r3 + addps m3, m3, m6 ; r1 + mova Z2(6), m4 + mova Z(2), m3 + mova m2, Z(3) + addps m3, m5, m0 ; t6 + subps m2, m2, m1 ; i3 + mova m7, Z(1) + addps m1, m1, Z(3) ; i1 + mova Z2(7), m2 + mova Z(3), m1 + subps m4, m7, m3 ; i2 + addps m3, m3, m7 ; i0 + mova Z(5), m4 + mova Z(1), m3 +%endmacro + +; scheduled to avoid store->load aliasing +%macro PASS_BIG 1 ; (!interleave) + mova m4, Z(4) ; r2 + mova m5, Z(5) ; i2 + mova m0, [wq] ; wre + mova m1, [wq+o1q] ; wim + mulps m2, m4, m0 ; r2*wre + mova m6, Z2(6) ; r3 + mulps m3, m5, m1 ; i2*wim + mova m7, Z2(7) ; i3 + mulps m4, m4, m1 ; r2*wim + mulps m5, m5, m0 ; i2*wre + addps m2, m2, m3 ; r2*wre + i2*wim + mulps m3, m1, m7 ; i3*wim + mulps m1, m1, m6 ; r3*wim + subps m5, m5, m4 ; i2*wre - r2*wim + mulps m4, m0, m6 ; r3*wre + mulps m0, m0, m7 ; i3*wre + subps m4, m4, m3 ; r3*wre - i3*wim + mova m3, Z(0) + addps m0, m0, m1 ; i3*wre + r3*wim + subps m1, m4, m2 ; t3 + addps m4, m4, m2 ; t5 + subps m3, m3, m4 ; r2 + addps m4, m4, Z(0) ; r0 + mova m6, Z(2) + mova Z(4), m3 + mova Z(0), m4 + subps m3, m5, m0 ; t4 + subps m4, m6, m3 ; r3 + addps m3, m3, m6 ; r1 +IF%1 mova Z2(6), m4 +IF%1 mova Z(2), m3 + mova m2, Z(3) + addps m5, m5, m0 ; t6 + subps m2, m2, m1 ; i3 + mova m7, Z(1) + addps m1, m1, Z(3) ; i1 +IF%1 mova Z2(7), m2 +IF%1 mova Z(3), m1 + subps m6, m7, m5 ; i2 + addps m5, m5, m7 ; i0 +IF%1 mova Z(5), m6 +IF%1 mova Z(1), m5 +%if %1==0 + INTERL m1, m3, m7, Z, 2 + INTERL m2, m4, m0, Z2, 6 + + mova m1, Z(0) + mova m2, Z(4) + + INTERL m5, m1, m3, Z, 0 + INTERL m6, m2, m7, Z, 4 +%endif +%endmacro + +%macro PUNPCK 3 + mova %3, %1 + punpckldq %1, %2 + punpckhdq %3, %2 +%endmacro + +%define Z(x) [r0+mmsize*x] +%define Z2(x) [r0+mmsize*x] +%define ZH(x) [r0+mmsize*x+mmsize/2] + +INIT_YMM avx + +%if HAVE_AVX_EXTERNAL +align 16 +fft8_avx: + mova m0, Z(0) + mova m1, Z(1) + T8_AVX m0, m1, m2, m3, m4 + mova Z(0), m0 + mova Z(1), m1 + ret + + +align 16 +fft16_avx: + mova m2, Z(2) + mova m3, Z(3) + T4_SSE m2, m3, m7 + + mova m0, Z(0) + mova m1, Z(1) + T8_AVX m0, m1, m4, m5, m7 + + mova m4, [ps_cos16_1] + mova m5, [ps_cos16_2] + vmulps m6, m2, m4 + vmulps m7, m3, m5 + vaddps m7, m7, m6 + vmulps m2, m2, m5 + vmulps m3, m3, m4 + vsubps m3, m3, m2 + vblendps m2, m7, m3, 0xf0 + vperm2f128 m3, m7, m3, 0x21 + vaddps m4, m2, m3 + vsubps m2, m3, m2 + vperm2f128 m2, m2, m2, 0x01 + vsubps m3, m1, m2 + vaddps m1, m1, m2 + vsubps m5, m0, m4 + vaddps m0, m0, m4 + vextractf128 Z(0), m0, 0 + vextractf128 ZH(0), m1, 0 + vextractf128 Z(1), m0, 1 + vextractf128 ZH(1), m1, 1 + vextractf128 Z(2), m5, 0 + vextractf128 ZH(2), m3, 0 + vextractf128 Z(3), m5, 1 + vextractf128 ZH(3), m3, 1 + ret + +align 16 +fft32_avx: + call fft16_avx + + mova m0, Z(4) + mova m1, Z(5) + + T4_SSE m0, m1, m4 + + mova m2, Z(6) + mova m3, Z(7) + + T8_SSE m0, m1, m2, m3, m4, m6 + ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11} + ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15} + + vperm2f128 m4, m0, m2, 0x20 + vperm2f128 m5, m1, m3, 0x20 + vperm2f128 m6, m0, m2, 0x31 + vperm2f128 m7, m1, m3, 0x31 + + PASS_SMALL 0, [cos_32], [cos_32+32] + + ret + +fft32_interleave_avx: + call fft32_avx + mov r2d, 32 +.deint_loop: + mova m2, Z(0) + mova m3, Z(1) + vunpcklps m0, m2, m3 + vunpckhps m1, m2, m3 + vextractf128 Z(0), m0, 0 + vextractf128 ZH(0), m1, 0 + vextractf128 Z(1), m0, 1 + vextractf128 ZH(1), m1, 1 + add r0, mmsize*2 + sub r2d, mmsize/4 + jg .deint_loop + ret + +%endif + +INIT_XMM sse + +align 16 +fft4_avx: +fft4_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova Z(0), m0 + mova Z(1), m1 + ret + +align 16 +fft8_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova m2, Z(2) + mova m3, Z(3) + T8_SSE m0, m1, m2, m3, m4, m5 + mova Z(0), m0 + mova Z(1), m1 + mova Z(2), m2 + mova Z(3), m3 + ret + +align 16 +fft16_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova m2, Z(2) + mova m3, Z(3) + T8_SSE m0, m1, m2, m3, m4, m5 + mova m4, Z(4) + mova m5, Z(5) + mova Z(0), m0 + mova Z(1), m1 + mova Z(2), m2 + mova Z(3), m3 + T4_SSE m4, m5, m6 + mova m6, Z2(6) + mova m7, Z2(7) + T4_SSE m6, m7, m0 + PASS_SMALL 0, [cos_16], [cos_16+16] + ret + + +%macro FFT48_3DNOW 0 +align 16 +fft4 %+ SUFFIX: + T2_3DNOW m0, m1, Z(0), Z(1) + mova m2, Z(2) + mova m3, Z(3) + T4_3DNOW m0, m1, m2, m3, m4, m5 + PUNPCK m0, m1, m4 + PUNPCK m2, m3, m5 + mova Z(0), m0 + mova Z(1), m4 + mova Z(2), m2 + mova Z(3), m5 + ret + +align 16 +fft8 %+ SUFFIX: + T2_3DNOW m0, m1, Z(0), Z(1) + mova m2, Z(2) + mova m3, Z(3) + T4_3DNOW m0, m1, m2, m3, m4, m5 + mova Z(0), m0 + mova Z(2), m2 + T2_3DNOW m4, m5, Z(4), Z(5) + T2_3DNOW m6, m7, Z2(6), Z2(7) + PSWAPD m0, m5 + PSWAPD m2, m7 + pxor m0, [ps_m1p1] + pxor m2, [ps_m1p1] + pfsub m5, m0 + pfadd m7, m2 + pfmul m5, [ps_root2] + pfmul m7, [ps_root2] + T4_3DNOW m1, m3, m5, m7, m0, m2 + mova Z(5), m5 + mova Z2(7), m7 + mova m0, Z(0) + mova m2, Z(2) + T4_3DNOW m0, m2, m4, m6, m5, m7 + PUNPCK m0, m1, m5 + PUNPCK m2, m3, m7 + mova Z(0), m0 + mova Z(1), m5 + mova Z(2), m2 + mova Z(3), m7 + PUNPCK m4, Z(5), m5 + PUNPCK m6, Z2(7), m7 + mova Z(4), m4 + mova Z(5), m5 + mova Z2(6), m6 + mova Z2(7), m7 + ret +%endmacro + +%if ARCH_X86_32 +INIT_MMX 3dnowext +FFT48_3DNOW + +INIT_MMX 3dnow +FFT48_3DNOW +%endif + +%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)] +%define Z2(x) [zcq + o3q + mmsize*(x&1)] +%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2] +%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2] + +%macro DECL_PASS 2+ ; name, payload +align 16 +%1: +DEFINE_ARGS zc, w, n, o1, o3 + lea o3q, [nq*3] + lea o1q, [nq*8] + shl o3q, 4 +.loop: + %2 + add zcq, mmsize*2 + add wq, mmsize + sub nd, mmsize/8 + jg .loop + rep ret +%endmacro + +%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs + lea r2, [dispatch_tab%1] + mov r2, [r2 + (%2q-2)*gprsize] +%ifdef PIC + lea r3, [$$] + add r2, r3 +%endif + call r2 +%endmacro ; FFT_DISPATCH + +INIT_YMM avx + +%if HAVE_AVX_EXTERNAL +DECL_PASS pass_avx, PASS_BIG 1 +DECL_PASS pass_interleave_avx, PASS_BIG 0 + +cglobal fft_calc, 2,5,8 + mov r3d, [r0 + FFTContext.nbits] + mov r0, r1 + mov r1, r3 + FFT_DISPATCH _interleave %+ SUFFIX, r1 + REP_RET + +%endif + +INIT_XMM sse + +DECL_PASS pass_sse, PASS_BIG 1 +DECL_PASS pass_interleave_sse, PASS_BIG 0 + +%macro FFT_CALC_FUNC 0 +cglobal fft_calc, 2,5,8 + mov r3d, [r0 + FFTContext.nbits] + PUSH r1 + PUSH r3 + mov r0, r1 + mov r1, r3 + FFT_DISPATCH _interleave %+ SUFFIX, r1 + POP rcx + POP r4 + cmp rcx, 3+(mmsize/16) + jg .end + mov r2, -1 + add rcx, 3 + shl r2, cl + sub r4, r2 +.loop: +%if mmsize == 8 + PSWAPD m0, [r4 + r2 + 4] + mova [r4 + r2 + 4], m0 +%else + movaps xmm0, [r4 + r2] + movaps xmm1, xmm0 + unpcklps xmm0, [r4 + r2 + 16] + unpckhps xmm1, [r4 + r2 + 16] + movaps [r4 + r2], xmm0 + movaps [r4 + r2 + 16], xmm1 +%endif + add r2, mmsize*2 + jl .loop +.end: +%if cpuflag(3dnow) + femms + RET +%else + REP_RET +%endif +%endmacro + +%if ARCH_X86_32 +INIT_MMX 3dnow +FFT_CALC_FUNC +INIT_MMX 3dnowext +FFT_CALC_FUNC +%endif +INIT_XMM sse +FFT_CALC_FUNC + +cglobal fft_permute, 2,7,1 + mov r4, [r0 + FFTContext.revtab] + mov r5, [r0 + FFTContext.tmpbuf] + mov ecx, [r0 + FFTContext.nbits] + mov r2, 1 + shl r2, cl + xor r0, r0 +%if ARCH_X86_32 + mov r1, r1m +%endif +.loop: + movaps xmm0, [r1 + 8*r0] + movzx r6, word [r4 + 2*r0] + movzx r3, word [r4 + 2*r0 + 2] + movlps [r5 + 8*r6], xmm0 + movhps [r5 + 8*r3], xmm0 + add r0, 2 + cmp r0, r2 + jl .loop + shl r2, 3 + add r1, r2 + add r5, r2 + neg r2 +; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B +.loopcopy: + movaps xmm0, [r5 + r2] + movaps xmm1, [r5 + r2 + 16] + movaps [r1 + r2], xmm0 + movaps [r1 + r2 + 16], xmm1 + add r2, 32 + jl .loopcopy + REP_RET + +%macro IMDCT_CALC_FUNC 0 +cglobal imdct_calc, 3,5,3 + mov r3d, [r0 + FFTContext.mdctsize] + mov r4, [r0 + FFTContext.imdcthalf] + add r1, r3 + PUSH r3 + PUSH r1 +%if ARCH_X86_32 + push r2 + push r1 + push r0 +%else + sub rsp, 8+32*WIN64 ; allocate win64 shadow space +%endif + call r4 +%if ARCH_X86_32 + add esp, 12 +%else + add rsp, 8+32*WIN64 +%endif + POP r1 + POP r3 + lea r0, [r1 + 2*r3] + mov r2, r3 + sub r3, mmsize + neg r2 + mova m2, [ps_neg] +.loop: +%if mmsize == 8 + PSWAPD m0, [r1 + r3] + PSWAPD m1, [r0 + r2] + pxor m0, m2 +%else + mova m0, [r1 + r3] + mova m1, [r0 + r2] + shufps m0, m0, 0x1b + shufps m1, m1, 0x1b + xorps m0, m2 +%endif + mova [r0 + r3], m1 + mova [r1 + r2], m0 + sub r3, mmsize + add r2, mmsize + jl .loop +%if cpuflag(3dnow) + femms + RET +%else + REP_RET +%endif +%endmacro + +%if ARCH_X86_32 +INIT_MMX 3dnow +IMDCT_CALC_FUNC +INIT_MMX 3dnowext +IMDCT_CALC_FUNC +%endif + +INIT_XMM sse +IMDCT_CALC_FUNC + +%if ARCH_X86_32 +INIT_MMX 3dnow +%define mulps pfmul +%define addps pfadd +%define subps pfsub +%define unpcklps punpckldq +%define unpckhps punpckhdq +DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q] +DECL_PASS pass_interleave_3dnow, PASS_BIG 0 +%define pass_3dnowext pass_3dnow +%define pass_interleave_3dnowext pass_interleave_3dnow +%endif + +%ifdef PIC +%define SECTION_REL - $$ +%else +%define SECTION_REL +%endif + +%macro DECL_FFT 1-2 ; nbits, suffix +%ifidn %0, 1 +%xdefine fullsuffix SUFFIX +%else +%xdefine fullsuffix %2 %+ SUFFIX +%endif +%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL +%if %1>=5 +%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL +%endif +%if %1>=6 +%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL +%endif + +%assign n 1<<%1 +%rep 18-%1 +%assign n2 n/2 +%assign n4 n/4 +%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL + +align 16 +fft %+ n %+ fullsuffix: + call fft %+ n2 %+ SUFFIX + add r0, n*4 - (n&(-2<<%1)) + call fft %+ n4 %+ SUFFIX + add r0, n*2 - (n2&(-2<<%1)) + call fft %+ n4 %+ SUFFIX + sub r0, n*6 + (n2&(-2<<%1)) + lea r1, [cos_ %+ n] + mov r2d, n4/2 + jmp pass %+ fullsuffix + +%assign n n*2 +%endrep +%undef n + +align 8 +dispatch_tab %+ fullsuffix: pointer list_of_fft +%endmacro ; DECL_FFT + +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +DECL_FFT 6 +DECL_FFT 6, _interleave +%endif +INIT_XMM sse +DECL_FFT 5 +DECL_FFT 5, _interleave +%if ARCH_X86_32 +INIT_MMX 3dnow +DECL_FFT 4 +DECL_FFT 4, _interleave +INIT_MMX 3dnowext +DECL_FFT 4 +DECL_FFT 4, _interleave +%endif + +INIT_XMM sse +%undef mulps +%undef addps +%undef subps +%undef unpcklps +%undef unpckhps + +%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8 +%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8 + PSWAPD m0, [%3+%2*4] + movq m2, [%3+%1*4-8] + movq m3, m0 + punpckldq m0, m2 + punpckhdq m2, m3 + movd m1, [%4+%1*2-4] ; tcos[j] + movd m3, [%4+%2*2] ; tcos[n4-j-1] + punpckldq m1, [%5+%1*2-4] ; tsin[j] + punpckldq m3, [%5+%2*2] ; tsin[n4-j-1] + + mova m4, m0 + PSWAPD m5, m1 + pfmul m0, m1 + pfmul m4, m5 + mova m6, m2 + PSWAPD m5, m3 + pfmul m2, m3 + pfmul m6, m5 +%if cpuflag(3dnowext) + pfpnacc m0, m4 + pfpnacc m2, m6 +%else + SBUTTERFLY dq, 0, 4, 1 + SBUTTERFLY dq, 2, 6, 3 + pxor m4, m7 + pxor m6, m7 + pfadd m0, m4 + pfadd m2, m6 +%endif +%else + movaps xmm0, [%3+%2*4] + movaps xmm1, [%3+%1*4-0x10] + movaps xmm2, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm1, xmm2, 0x77 + movlps xmm4, [%4+%2*2] + movlps xmm5, [%5+%2*2+0x0] + movhps xmm4, [%4+%1*2-0x8] + movhps xmm5, [%5+%1*2-0x8] + movaps xmm2, xmm0 + movaps xmm3, xmm1 + mulps xmm0, xmm5 + mulps xmm1, xmm4 + mulps xmm2, xmm4 + mulps xmm3, xmm5 + subps xmm1, xmm0 + addps xmm2, xmm3 + movaps xmm0, xmm1 + unpcklps xmm1, xmm2 + unpckhps xmm0, xmm2 +%endif +%endmacro + +%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 +%if cpuflag(sse) + mulps m6, %3, [%5+%1] + mulps m7, %2, [%5+%1] + mulps %2, %2, [%6+%1] + mulps %3, %3, [%6+%1] + subps %2, %2, m6 + addps %3, %3, m7 +%elif cpuflag(3dnow) + mova m6, [%1+%2*2] + mova %3, [%1+%2*2+8] + mova %4, m6 + mova m7, %3 + pfmul m6, [%5+%2] + pfmul %3, [%6+%2] + pfmul %4, [%6+%2] + pfmul m7, [%5+%2] + pfsub %3, m6 + pfadd %4, m7 +%endif +%endmacro + +%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 +.post: +%if cpuflag(avx) + vmovaps ymm1, [%3+%1*2] + vmovaps ymm0, [%3+%1*2+0x20] + vmovaps ymm3, [%3+%2*2] + vmovaps ymm2, [%3+%2*2+0x20] + + CMUL %1, ymm0, ymm1, %3, %4, %5 + CMUL %2, ymm2, ymm3, %3, %4, %5 + vshufps ymm1, ymm1, ymm1, 0x1b + vshufps ymm3, ymm3, ymm3, 0x1b + vperm2f128 ymm1, ymm1, ymm1, 0x01 + vperm2f128 ymm3, ymm3, ymm3, 0x01 + vunpcklps ymm6, ymm2, ymm1 + vunpckhps ymm4, ymm2, ymm1 + vunpcklps ymm7, ymm0, ymm3 + vunpckhps ymm5, ymm0, ymm3 + + vextractf128 [%3+%1*2], ymm7, 0 + vextractf128 [%3+%1*2+0x10], ymm5, 0 + vextractf128 [%3+%1*2+0x20], ymm7, 1 + vextractf128 [%3+%1*2+0x30], ymm5, 1 + + vextractf128 [%3+%2*2], ymm6, 0 + vextractf128 [%3+%2*2+0x10], ymm4, 0 + vextractf128 [%3+%2*2+0x20], ymm6, 1 + vextractf128 [%3+%2*2+0x30], ymm4, 1 + sub %2, 0x20 + add %1, 0x20 + jl .post +%elif cpuflag(sse) + movaps xmm1, [%3+%1*2] + movaps xmm0, [%3+%1*2+0x10] + CMUL %1, xmm0, xmm1, %3, %4, %5 + movaps xmm5, [%3+%2*2] + movaps xmm4, [%3+%2*2+0x10] + CMUL %2, xmm4, xmm5, %3, %4, %5 + shufps xmm1, xmm1, 0x1b + shufps xmm5, xmm5, 0x1b + movaps xmm6, xmm4 + unpckhps xmm4, xmm1 + unpcklps xmm6, xmm1 + movaps xmm2, xmm0 + unpcklps xmm0, xmm5 + unpckhps xmm2, xmm5 + movaps [%3+%2*2], xmm6 + movaps [%3+%2*2+0x10], xmm4 + movaps [%3+%1*2], xmm0 + movaps [%3+%1*2+0x10], xmm2 + sub %2, 0x10 + add %1, 0x10 + jl .post +%elif cpuflag(3dnow) + CMUL %3, %1, m0, m1, %4, %5 + CMUL %3, %2, m2, m3, %4, %5 + movd [%3+%1*2+ 0], m0 + movd [%3+%2*2+12], m1 + movd [%3+%2*2+ 0], m2 + movd [%3+%1*2+12], m3 + psrlq m0, 32 + psrlq m1, 32 + psrlq m2, 32 + psrlq m3, 32 + movd [%3+%1*2+ 8], m0 + movd [%3+%2*2+ 4], m1 + movd [%3+%2*2+ 8], m2 + movd [%3+%1*2+ 4], m3 + sub %2, 8 + add %1, 8 + jl .post +%endif +%endmacro + +%macro DECL_IMDCT 0 +cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input +%if ARCH_X86_64 +%define rrevtab r7 +%define rtcos r8 +%define rtsin r9 +%else +%define rrevtab r6 +%define rtsin r6 +%define rtcos r5 +%endif + mov r3d, [r0+FFTContext.mdctsize] + add r2, r3 + shr r3, 1 + mov rtcos, [r0+FFTContext.tcos] + mov rtsin, [r0+FFTContext.tsin] + add rtcos, r3 + add rtsin, r3 +%if ARCH_X86_64 == 0 + push rtcos + push rtsin +%endif + shr r3, 1 + mov rrevtab, [r0+FFTContext.revtab] + add rrevtab, r3 +%if ARCH_X86_64 == 0 + push rrevtab +%endif + +%if mmsize == 8 + sub r3, 2 +%else + sub r3, 4 +%endif +%if ARCH_X86_64 || mmsize == 8 + xor r4, r4 + sub r4, r3 +%endif +%if notcpuflag(3dnowext) && mmsize == 8 + movd m7, [ps_neg] +%endif +.pre: +%if ARCH_X86_64 == 0 +;unspill +%if mmsize != 8 + xor r4, r4 + sub r4, r3 +%endif + mov rtcos, [esp+8] + mov rtsin, [esp+4] +%endif + + PREROTATER r4, r3, r2, rtcos, rtsin +%if mmsize == 8 + mov r6, [esp] ; rrevtab = ptr+n8 + movzx r5, word [rrevtab+r4-2] ; rrevtab[j] + movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1] + mova [r1+r5*8], m0 + mova [r1+r6*8], m2 + add r4, 2 + sub r3, 2 +%else +%if ARCH_X86_64 + movzx r5, word [rrevtab+r4-4] + movzx r6, word [rrevtab+r4-2] + movzx r10, word [rrevtab+r3] + movzx r11, word [rrevtab+r3+2] + movlps [r1+r5 *8], xmm0 + movhps [r1+r6 *8], xmm0 + movlps [r1+r10*8], xmm1 + movhps [r1+r11*8], xmm1 + add r4, 4 +%else + mov r6, [esp] + movzx r5, word [r6+r4-4] + movzx r4, word [r6+r4-2] + movlps [r1+r5*8], xmm0 + movhps [r1+r4*8], xmm0 + movzx r5, word [r6+r3] + movzx r4, word [r6+r3+2] + movlps [r1+r5*8], xmm1 + movhps [r1+r4*8], xmm1 +%endif + sub r3, 4 +%endif + jns .pre + + mov r5, r0 + mov r6, r1 + mov r0, r1 + mov r1d, [r5+FFTContext.nbits] + + FFT_DISPATCH SUFFIX, r1 + + mov r0d, [r5+FFTContext.mdctsize] + add r6, r0 + shr r0, 1 +%if ARCH_X86_64 == 0 +%define rtcos r2 +%define rtsin r3 + mov rtcos, [esp+8] + mov rtsin, [esp+4] +%endif + neg r0 + mov r1, -mmsize + sub r1, r0 + POSROTATESHUF r0, r1, r6, rtcos, rtsin +%if ARCH_X86_64 == 0 + add esp, 12 +%endif +%if mmsize == 8 + femms +%endif + RET +%endmacro + +DECL_IMDCT + +%if ARCH_X86_32 +INIT_MMX 3dnow +DECL_IMDCT + +INIT_MMX 3dnowext +DECL_IMDCT +%endif + +INIT_YMM avx + +%if HAVE_AVX_EXTERNAL +DECL_IMDCT +%endif diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/mdct15.asm b/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/mdct15.asm new file mode 100644 index 000000000..2a2cdbd21 --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/mdct15.asm @@ -0,0 +1,221 @@ +;****************************************************************************** +;* SIMD optimized non-power-of-two MDCT functions +;* +;* Copyright (C) 2017 Rostislav Pehlivanov +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +perm_neg: dd 2, 5, 3, 4, 6, 1, 7, 0 +perm_pos: dd 0, 7, 1, 6, 4, 3, 5, 2 +sign_adjust_r: times 4 dd 0x80000000, 0x00000000 + +sign_adjust_5: dd 0x00000000, 0x80000000, 0x80000000, 0x00000000 + +SECTION .text + +%if ARCH_X86_64 + +;***************************************************************************************** +;void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride); +;***************************************************************************************** +%macro FFT5 3 ; %1 - in_offset, %2 - dst1 (64bit used), %3 - dst2 + VBROADCASTSD m0, [inq + %1] ; in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im + movsd xm1, [inq + 1*16 + 8 + %1] ; in[ 3].re, in[ 3].im, 0, 0 + movsd xm4, [inq + 6*16 + 0 + %1] ; in[12].re, in[12].im, 0, 0 + movhps xm1, [inq + 3*16 + 0 + %1] ; in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im + movhps xm4, [inq + 4*16 + 8 + %1] ; in[12].re, in[12].im, in[ 9].re, in[ 9].im + + subps xm2, xm1, xm4 ; t[2].im, t[2].re, t[3].im, t[3].re + addps xm1, xm4 ; t[0].re, t[0].im, t[1].re, t[1].im + + movhlps %2, xm1 ; t[0].re, t[1].re, t[0].im, t[1].im + addps %2, xm1 + addps %2, xm0 ; DC[0].re, DC[0].im, junk... + movlhps %2, %2 ; DC[0].re, DC[0].im, DC[0].re, DC[0].im + + shufps xm3, xm1, xm2, q0110 ; t[0].re, t[0].im, t[2].re, t[2].im + shufps xm1, xm2, q2332 ; t[1].re, t[1].im, t[3].re, t[3].im + + mulps xm%3, xm1, xm5 + mulps xm4, xm3, xm6 + mulps xm1, xm6 + + xorps xm1, xm7 + mulps xm3, xm5 + addsubps xm3, xm1 ; t[0].re, t[0].im, t[2].re, t[2].im + subps xm%3, xm4 ; t[4].re, t[4].im, t[5].re, t[5].im + + movhlps xm2, xm%3, xm3 ; t[2].re, t[2].im, t[5].re, t[5].im + movlhps xm3, xm%3 ; t[0].re, t[0].im, t[4].re, t[4].im + + xorps xm2, xm7 + addps xm%3, xm2, xm3 + subps xm3, xm2 + + shufps xm3, xm3, q1032 + vinsertf128 m%3, m%3, xm3, 1 ; All ACs (tmp[1] through to tmp[4]) + addps m%3, m%3, m0 ; Finally offset with DCs +%endmacro + +%macro BUTTERFLIES_DC 1 ; %1 - exptab_offset + mulps xm0, xm9, [exptabq + %1 + 16*0] + mulps xm1, xm10, [exptabq + %1 + 16*1] + + haddps xm0, xm1 + movhlps xm1, xm0 ; t[0].re, t[1].re, t[0].im, t[1].im + + addps xm0, xm1 + addps xm0, xm8 + + movsd [outq], xm0 +%endmacro + +%macro BUTTERFLIES_AC 1 ; %1 - exptab_offset + mulps m0, m12, [exptabq + 64*0 + 0*mmsize + %1] + mulps m1, m12, [exptabq + 64*0 + 1*mmsize + %1] + mulps m2, m13, [exptabq + 64*1 + 0*mmsize + %1] + mulps m3, m13, [exptabq + 64*1 + 1*mmsize + %1] + + addps m0, m0, m2 + addps m1, m1, m3 + addps m0, m0, m11 + + shufps m1, m1, m1, q2301 + addps m0, m0, m1 + + vextractf128 xm1, m0, 1 + + movlps [outq + strideq*1], xm0 + movhps [outq + strideq*2], xm0 + movlps [outq + stride3q], xm1 + movhps [outq + strideq*4], xm1 +%endmacro + +INIT_YMM avx +cglobal fft15, 4, 5, 14, out, in, exptab, stride, stride5 + shl strideq, 3 + + movaps xm5, [exptabq + 480 + 16*0] + movaps xm6, [exptabq + 480 + 16*1] + movaps xm7, [sign_adjust_5] + + FFT5 0, xm8, 11 + FFT5 8, xm9, 12 + FFT5 16, xm10, 13 + +%define stride3q inq + lea stride3q, [strideq + strideq*2] + lea stride5q, [strideq + strideq*4] + + BUTTERFLIES_DC (8*6 + 4*0)*2*4 + BUTTERFLIES_AC (8*0 + 0*0)*2*4 + + add outq, stride5q + BUTTERFLIES_DC (8*6 + 4*1)*2*4 + BUTTERFLIES_AC (8*2 + 0*0)*2*4 + + add outq, stride5q + BUTTERFLIES_DC (8*6 + 4*2)*2*4 + BUTTERFLIES_AC (8*4 + 0*0)*2*4 + + RET + +%endif ; ARCH_X86_64 + +;******************************************************************************************************* +;void ff_mdct15_postreindex(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8); +;******************************************************************************************************* +%macro LUT_LOAD_4D 3 + mov r4d, [lutq + %3q*4 + 0] + movsd xmm%1, [inq + r4q*8] + mov r4d, [lutq + %3q*4 + 4] + movhps xmm%1, [inq + r4q*8] +%if cpuflag(avx2) + mov r4d, [lutq + %3q*4 + 8] + movsd %2, [inq + r4q*8] + mov r4d, [lutq + %3q*4 + 12] + movhps %2, [inq + r4q*8] + vinsertf128 %1, %1, %2, 1 +%endif +%endmacro + +%macro POSTROTATE_FN 1 +cglobal mdct15_postreindex, 5, 7, 8 + cpuflag(avx2)*2, out, in, exp, lut, len8, offset_p, offset_n + + xor offset_nq, offset_nq + lea offset_pq, [len8q*2 - %1] + + movaps m7, [sign_adjust_r] + +%if cpuflag(avx2) + movaps m8, [perm_pos] + movaps m9, [perm_neg] +%endif + +.loop: + movups m0, [expq + offset_pq*8] ; exp[p0].re, exp[p0].im, exp[p1].re, exp[p1].im, exp[p2].re, exp[p2].im, exp[p3].re, exp[p3].im + movups m1, [expq + offset_nq*8] ; exp[n3].re, exp[n3].im, exp[n2].re, exp[n2].im, exp[n1].re, exp[n1].im, exp[n0].re, exp[n0].im + + LUT_LOAD_4D m3, xm4, offset_p ; in[p0].re, in[p0].im, in[p1].re, in[p1].im, in[p2].re, in[p2].im, in[p3].re, in[p3].im + LUT_LOAD_4D m4, xm5, offset_n ; in[n3].re, in[n3].im, in[n2].re, in[n2].im, in[n1].re, in[n1].im, in[n0].re, in[n0].im + + mulps m5, m3, m0 ; in[p].reim * exp[p].reim + mulps m6, m4, m1 ; in[n].reim * exp[n].reim + + xorps m5, m7 ; in[p].re *= -1, in[p].im *= 1 + xorps m6, m7 ; in[n].re *= -1, in[n].im *= 1 + + shufps m3, m3, m3, q2301 ; in[p].imre + shufps m4, m4, m4, q2301 ; in[n].imre + + mulps m3, m0 ; in[p].imre * exp[p].reim + mulps m4, m1 ; in[n].imre * exp[n].reim + + haddps m3, m6 ; out[n0].im, out[n1].im, out[n3].re, out[n2].re, out[n2].im, out[n3].im, out[n1].re, out[n0].re + haddps m5, m4 ; out[p0].re, out[p1].re, out[p3].im, out[p2].im, out[p2].re, out[p3].re, out[p1].im, out[p0].im + +%if cpuflag(avx2) + vpermps m3, m9, m3 ; out[n3].im, out[n3].re, out[n2].im, out[n2].re, out[n1].im, out[n1].re, out[n0].im, out[n0].re + vpermps m5, m8, m5 ; out[p0].re, out[p0].im, out[p1].re, out[p1].im, out[p2].re, out[p2].im, out[p3].re, out[p3].im +%else + shufps m3, m3, m3, q0312 + shufps m5, m5, m5, q2130 +%endif + + movups [outq + offset_nq*8], m3 + movups [outq + offset_pq*8], m5 + + sub offset_pq, %1 + add offset_nq, %1 + cmp offset_nq, offset_pq + jle .loop + + REP_RET +%endmacro + +INIT_XMM sse3 +POSTROTATE_FN 2 + +%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +POSTROTATE_FN 4 +%endif diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/sbrdsp.asm b/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/sbrdsp.asm new file mode 100644 index 000000000..62bbe512e --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/sbrdsp.asm @@ -0,0 +1,548 @@ +;****************************************************************************** +;* AAC Spectral Band Replication decoding functions +;* Copyright (C) 2012 Christophe Gisquet +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA +; mask equivalent for multiply by -1.0 1.0 +ps_mask times 2 dd 1<<31, 0 +ps_mask2 times 2 dd 0, 1<<31 +ps_mask3 dd 0, 0, 0, 1<<31 +ps_noise0 times 2 dd 1.0, 0.0, +ps_noise2 times 2 dd -1.0, 0.0 +ps_noise13 dd 0.0, 1.0, 0.0, -1.0 + dd 0.0, -1.0, 0.0, 1.0 + dd 0.0, 1.0, 0.0, -1.0 +cextern sbr_noise_table +cextern ps_neg + +SECTION .text + +INIT_XMM sse +cglobal sbr_sum_square, 2, 3, 6 + mov r2d, r1d + xorps m0, m0 + xorps m1, m1 + sar r2, 3 + jz .prepare +.loop: + movu m2, [r0 + 0] + movu m3, [r0 + 16] + movu m4, [r0 + 32] + movu m5, [r0 + 48] + mulps m2, m2 + mulps m3, m3 + mulps m4, m4 + mulps m5, m5 + addps m0, m2 + addps m1, m3 + addps m0, m4 + addps m1, m5 + add r0, 64 + dec r2 + jnz .loop +.prepare: + and r1, 7 + sar r1, 1 + jz .end +; len is a multiple of 2, thus there are at least 4 elements to process +.endloop: + movu m2, [r0] + add r0, 16 + mulps m2, m2 + dec r1 + addps m0, m2 + jnz .endloop +.end: + addps m0, m1 + movhlps m2, m0 + addps m0, m2 + movss m1, m0 + shufps m0, m0, 1 + addss m0, m1 +%if ARCH_X86_64 == 0 + movss r0m, m0 + fld dword r0m +%endif + RET + +%define STEP 40*4*2 +cglobal sbr_hf_g_filt, 5, 6, 5 + lea r1, [r1 + 8*r4] ; offset by ixh elements into X_high + mov r5, r3 + and r3, 0xFC + lea r2, [r2 + r3*4] + lea r0, [r0 + r3*8] + neg r3 + jz .loop1 +.loop4: + movlps m0, [r2 + 4*r3 + 0] + movlps m1, [r2 + 4*r3 + 8] + movlps m2, [r1 + 0*STEP] + movlps m3, [r1 + 2*STEP] + movhps m2, [r1 + 1*STEP] + movhps m3, [r1 + 3*STEP] + unpcklps m0, m0 + unpcklps m1, m1 + mulps m0, m2 + mulps m1, m3 + movu [r0 + 8*r3 + 0], m0 + movu [r0 + 8*r3 + 16], m1 + add r1, 4*STEP + add r3, 4 + jnz .loop4 + and r5, 3 ; number of single element loops + jz .end +.loop1: ; element 0 and 1 can be computed at the same time + movss m0, [r2] + movlps m2, [r1] + unpcklps m0, m0 + mulps m2, m0 + movlps [r0], m2 + add r0, 8 + add r2, 4 + add r1, STEP + dec r5 + jnz .loop1 +.end: + RET + +; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2], +; const float alpha0[2], const float alpha1[2], +; float bw, int start, int end) +; +cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E + ; load alpha factors +%define bw m0 +%if ARCH_X86_64 == 0 || WIN64 + movss bw, BWm +%endif + movlps m2, [alpha1q] + movlps m1, [alpha0q] + shufps bw, bw, 0 + mulps m2, bw ; (a1[0] a1[1])*bw + mulps m1, bw ; (a0[0] a0[1])*bw = (a2 a3) + mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1) + mova m3, m1 + mova m4, m2 + + ; Set pointers +%if ARCH_X86_64 == 0 || WIN64 + ; start and end 6th and 7th args on stack + mov r2d, Sm + mov r3d, Em + DEFINE_ARGS X_high, X_low, start, end +%else +; BW does not actually occupy a register, so shift by 1 + DEFINE_ARGS X_high, X_low, alpha0, alpha1, start, end + movsxd startq, startd + movsxd endq, endd +%endif + sub startq, endq ; neg num of loops + lea X_highq, [X_highq + endq*2*4] + lea X_lowq, [X_lowq + endq*2*4 - 2*2*4] + shl startq, 3 ; offset from num loops + + mova m0, [X_lowq + startq] + shufps m3, m3, q1111 + shufps m4, m4, q1111 + xorps m3, [ps_mask] + shufps m1, m1, q0000 + shufps m2, m2, q0000 + xorps m4, [ps_mask] +.loop2: + movu m7, [X_lowq + startq + 8] ; BbCc + mova m6, m0 + mova m5, m7 + shufps m0, m0, q2301 ; aAbB + shufps m7, m7, q2301 ; bBcC + mulps m0, m4 + mulps m7, m3 + mulps m6, m2 + mulps m5, m1 + addps m7, m0 + mova m0, [X_lowq + startq + 16] ; CcDd + addps m7, m0 + addps m6, m5 + addps m7, m6 + mova [X_highq + startq], m7 + add startq, 16 + jnz .loop2 + RET + +cglobal sbr_sum64x5, 1,2,4,z + lea r1q, [zq+ 256] +.loop: + mova m0, [zq+ 0] + mova m2, [zq+ 16] + mova m1, [zq+ 256] + mova m3, [zq+ 272] + addps m0, [zq+ 512] + addps m2, [zq+ 528] + addps m1, [zq+ 768] + addps m3, [zq+ 784] + addps m0, [zq+1024] + addps m2, [zq+1040] + addps m0, m1 + addps m2, m3 + mova [zq], m0 + mova [zq+16], m2 + add zq, 32 + cmp zq, r1q + jne .loop + REP_RET + +INIT_XMM sse +cglobal sbr_qmf_post_shuffle, 2,3,4,W,z + lea r2q, [zq + (64-4)*4] + mova m3, [ps_neg] +.loop: + mova m1, [zq] + xorps m0, m3, [r2q] + shufps m0, m0, m0, q0123 + unpcklps m2, m0, m1 + unpckhps m0, m0, m1 + mova [Wq + 0], m2 + mova [Wq + 16], m0 + add Wq, 32 + sub r2q, 16 + add zq, 16 + cmp zq, r2q + jl .loop + REP_RET + +INIT_XMM sse +cglobal sbr_neg_odd_64, 1,2,4,z + lea r1q, [zq+256] +.loop: + mova m0, [zq+ 0] + mova m1, [zq+16] + mova m2, [zq+32] + mova m3, [zq+48] + xorps m0, [ps_mask2] + xorps m1, [ps_mask2] + xorps m2, [ps_mask2] + xorps m3, [ps_mask2] + mova [zq+ 0], m0 + mova [zq+16], m1 + mova [zq+32], m2 + mova [zq+48], m3 + add zq, 64 + cmp zq, r1q + jne .loop + REP_RET + +; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1) +%macro SBR_QMF_DEINT_BFLY 0 +cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c + mov cq, 64*4-2*mmsize + lea vrevq, [vq + 64*4] +.loop: + mova m0, [src0q+cq] + mova m1, [src1q] + mova m4, [src0q+cq+mmsize] + mova m5, [src1q+mmsize] +%if cpuflag(sse2) + pshufd m2, m0, q0123 + pshufd m3, m1, q0123 + pshufd m6, m4, q0123 + pshufd m7, m5, q0123 +%else + shufps m2, m0, m0, q0123 + shufps m3, m1, m1, q0123 + shufps m6, m4, m4, q0123 + shufps m7, m5, m5, q0123 +%endif + addps m5, m2 + subps m0, m7 + addps m1, m6 + subps m4, m3 + mova [vrevq], m1 + mova [vrevq+mmsize], m5 + mova [vq+cq], m0 + mova [vq+cq+mmsize], m4 + add src1q, 2*mmsize + add vrevq, 2*mmsize + sub cq, 2*mmsize + jge .loop + REP_RET +%endmacro + +INIT_XMM sse +SBR_QMF_DEINT_BFLY + +INIT_XMM sse2 +SBR_QMF_DEINT_BFLY + +INIT_XMM sse2 +cglobal sbr_qmf_pre_shuffle, 1,4,6,z +%define OFFSET (32*4-2*mmsize) + mov r3q, OFFSET + lea r1q, [zq + (32+1)*4] + lea r2q, [zq + 64*4] + mova m5, [ps_neg] +.loop: + movu m0, [r1q] + movu m2, [r1q + mmsize] + movu m1, [zq + r3q + 4 + mmsize] + movu m3, [zq + r3q + 4] + + pxor m2, m5 + pxor m0, m5 + pshufd m2, m2, q0123 + pshufd m0, m0, q0123 + SBUTTERFLY dq, 2, 3, 4 + SBUTTERFLY dq, 0, 1, 4 + mova [r2q + 2*r3q + 0*mmsize], m2 + mova [r2q + 2*r3q + 1*mmsize], m3 + mova [r2q + 2*r3q + 2*mmsize], m0 + mova [r2q + 2*r3q + 3*mmsize], m1 + add r1q, 2*mmsize + sub r3q, 2*mmsize + jge .loop + movq m2, [zq] + movq [r2q], m2 + REP_RET + +%ifdef PIC +%define NREGS 1 +%if UNIX64 +%define NOISE_TABLE r6q ; r5q is m_max +%else +%define NOISE_TABLE r5q +%endif +%else +%define NREGS 0 +%define NOISE_TABLE sbr_noise_table +%endif + +%macro LOAD_NST 1 +%ifdef PIC + lea NOISE_TABLE, [%1] + mova m0, [kxq + NOISE_TABLE] +%else + mova m0, [kxq + %1] +%endif +%endmacro + +INIT_XMM sse2 +; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m, +; const float *q_filt, int noise, +; int kx, int m_max) +cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max + mova m0, [ps_noise0] + jmp apply_noise_main + +; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m, +; const float *q_filt, int noise, +; int kx, int m_max) +cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max + and kxq, 1 + shl kxq, 4 + LOAD_NST ps_noise13 + jmp apply_noise_main + +; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m, +; const float *q_filt, int noise, +; int kx, int m_max) +cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max + mova m0, [ps_noise2] + jmp apply_noise_main + +; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m, +; const float *q_filt, int noise, +; int kx, int m_max) +cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max + and kxq, 1 + shl kxq, 4 + LOAD_NST ps_noise13+16 + +apply_noise_main: +%if ARCH_X86_64 == 0 || WIN64 + mov kxd, m_maxm + DEFINE_ARGS Y, s_m, q_filt, noise, count +%else + DEFINE_ARGS Y, s_m, q_filt, noise, kx, count +%endif + movsxdifnidn noiseq, noised + dec noiseq + shl countd, 2 +%ifdef PIC + lea NOISE_TABLE, [sbr_noise_table] +%endif + lea Yq, [Yq + 2*countq] + add s_mq, countq + add q_filtq, countq + shl noiseq, 3 + pxor m5, m5 + neg countq +.loop: + mova m1, [q_filtq + countq] + movu m3, [noiseq + NOISE_TABLE + 1*mmsize] + movu m4, [noiseq + NOISE_TABLE + 2*mmsize] + add noiseq, 2*mmsize + and noiseq, 0x1ff<<3 + punpckhdq m2, m1, m1 + punpckldq m1, m1 + mulps m1, m3 ; m2 = q_filt[m] * ff_sbr_noise_table[noise] + mulps m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise] + mova m3, [s_mq + countq] + ; TODO: replace by a vpermd in AVX2 + punpckhdq m4, m3, m3 + punpckldq m3, m3 + pcmpeqd m6, m3, m5 ; m6 == 0 + pcmpeqd m7, m4, m5 ; m7 == 0 + mulps m3, m0 ; s_m[m] * phi_sign + mulps m4, m0 ; s_m[m] * phi_sign + pand m1, m6 + pand m2, m7 + movu m6, [Yq + 2*countq] + movu m7, [Yq + 2*countq + mmsize] + addps m3, m1 + addps m4, m2 + addps m6, m3 + addps m7, m4 + movu [Yq + 2*countq], m6 + movu [Yq + 2*countq + mmsize], m7 + add countq, mmsize + jl .loop + RET + +INIT_XMM sse +cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c +%define COUNT 32*4 +%define OFFSET 32*4 + mov cq, -COUNT + lea vrevq, [vq + OFFSET + COUNT] + add vq, OFFSET-mmsize + add srcq, 2*COUNT + mova m3, [ps_neg] +.loop: + mova m0, [srcq + 2*cq + 0*mmsize] + mova m1, [srcq + 2*cq + 1*mmsize] + shufps m2, m0, m1, q2020 + shufps m1, m0, q1313 + xorps m2, m3 + mova [vq], m1 + mova [vrevq + cq], m2 + sub vq, mmsize + add cq, mmsize + jl .loop + REP_RET + +%macro SBR_AUTOCORRELATE 0 +cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt + mov cntq, 37*8 + add xq, cntq + neg cntq + +%if cpuflag(sse3) +%define MOVH movsd + movddup m5, [xq+cntq] +%else +%define MOVH movlps + movlps m5, [xq+cntq] + movlhps m5, m5 +%endif + MOVH m7, [xq+cntq+8 ] + MOVH m1, [xq+cntq+16] + shufps m7, m7, q0110 + shufps m1, m1, q0110 + mulps m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0] + mulps m4, m5, m5 ; x[0][0] * x[0][0], x[0][1] * x[0][1]; + mulps m5, m1 ; real_sum2 = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0] + movaps [rsp ], m3 + movaps [rsp+16], m4 + add cntq, 8 + + MOVH m2, [xq+cntq+16] + movlhps m7, m7 + shufps m2, m2, q0110 + mulps m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0] + mulps m4, m7, m2 + mulps m7, m7 ; real_sum0 = x[1][0] * x[1][0], x[1][1] * x[1][1]; + addps m5, m4 ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0] + +align 16 +.loop: + add cntq, 8 + MOVH m0, [xq+cntq+16] + movlhps m1, m1 + shufps m0, m0, q0110 + mulps m3, m1, m2 + mulps m4, m1, m0 + mulps m1, m1 + addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0]; + addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0]; + addps m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1]; + add cntq, 8 + MOVH m1, [xq+cntq+16] + movlhps m2, m2 + shufps m1, m1, q0110 + mulps m3, m2, m0 + mulps m4, m2, m1 + mulps m2, m2 + addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0]; + addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0]; + addps m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1]; + add cntq, 8 + MOVH m2, [xq+cntq+16] + movlhps m0, m0 + shufps m2, m2, q0110 + mulps m3, m0, m1 + mulps m4, m0, m2 + mulps m0, m0 + addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0]; + addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0]; + addps m7, m0 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1]; + jl .loop + + movlhps m1, m1 + mulps m2, m1 + mulps m1, m1 + addps m2, m6 ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0]; + addps m1, m7 ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1]; + addps m6, [rsp ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0]; + addps m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1]; + + xorps m2, [ps_mask3] + xorps m5, [ps_mask3] + xorps m6, [ps_mask3] + HADDPS m2, m5, m3 + HADDPS m7, m6, m4 +%if cpuflag(sse3) + movshdup m0, m1 +%else + movss m0, m1 + shufps m1, m1, q0001 +%endif + addss m1, m0 + movaps [phiq ], m2 + movhps [phiq+0x18], m7 + movss [phiq+0x28], m7 + movss [phiq+0x10], m1 + RET +%endmacro + +INIT_XMM sse +SBR_AUTOCORRELATE +INIT_XMM sse3 +SBR_AUTOCORRELATE diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/aarch64/Makefile b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/aarch64/Makefile new file mode 100644 index 000000000..5613813ba --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/aarch64/Makefile @@ -0,0 +1,4 @@ +OBJS += aarch64/cpu.o \ + aarch64/float_dsp_init.o \ + +NEON-OBJS += aarch64/float_dsp_neon.o diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/arm/Makefile b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/arm/Makefile new file mode 100644 index 000000000..5da44b054 --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/arm/Makefile @@ -0,0 +1,8 @@ +OBJS += arm/cpu.o \ + arm/float_dsp_init_arm.o \ + +VFP-OBJS += arm/float_dsp_init_vfp.o \ + arm/float_dsp_vfp.o \ + +NEON-OBJS += arm/float_dsp_init_neon.o \ + arm/float_dsp_neon.o \ diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/ffversion.h b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/ffversion.h deleted file mode 100644 index 723c7d424..000000000 --- a/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/ffversion.h +++ /dev/null @@ -1,5 +0,0 @@ -/* Automatically generated by version.sh, do not manually edit! */ -#ifndef AVUTIL_FFVERSION_H -#define AVUTIL_FFVERSION_H -#define FFMPEG_VERSION "" -#endif /* AVUTIL_FFVERSION_H */ diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/Makefile b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/Makefile new file mode 100644 index 000000000..5f5242b5b --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/Makefile @@ -0,0 +1,18 @@ +OBJS += x86/cpu.o \ + x86/fixed_dsp_init.o \ + x86/float_dsp_init.o \ + x86/imgutils_init.o \ + x86/lls_init.o \ + +OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o \ + +EMMS_OBJS_$(HAVE_MMX_INLINE)_$(HAVE_MMX_EXTERNAL)_$(HAVE_MM_EMPTY) = x86/emms.o + +X86ASM-OBJS += x86/cpuid.o \ + $(EMMS_OBJS__yes_) \ + x86/fixed_dsp.o \ + x86/float_dsp.o \ + x86/imgutils.o \ + x86/lls.o \ + +X86ASM-OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils.o \ diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/cpuid.asm b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/cpuid.asm new file mode 100644 index 000000000..c3f7866ec --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/cpuid.asm @@ -0,0 +1,91 @@ +;***************************************************************************** +;* Copyright (C) 2005-2010 x264 project +;* +;* Authors: Loren Merritt +;* Fiona Glaser +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86util.asm" + +SECTION .text + +;----------------------------------------------------------------------------- +; void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx) +;----------------------------------------------------------------------------- +cglobal cpu_cpuid, 5,7 + push rbx + push r4 + push r3 + push r2 + push r1 + mov eax, r0d + xor ecx, ecx + cpuid + pop r4 + mov [r4], eax + pop r4 + mov [r4], ebx + pop r4 + mov [r4], ecx + pop r4 + mov [r4], edx + pop rbx + RET + +;----------------------------------------------------------------------------- +; void ff_cpu_xgetbv(int op, int *eax, int *edx) +;----------------------------------------------------------------------------- +cglobal cpu_xgetbv, 3,7 + push r2 + push r1 + mov ecx, r0d + xgetbv + pop r4 + mov [r4], eax + pop r4 + mov [r4], edx + RET + +%if ARCH_X86_64 == 0 +;----------------------------------------------------------------------------- +; int ff_cpu_cpuid_test(void) +; return 0 if unsupported +;----------------------------------------------------------------------------- +cglobal cpu_cpuid_test + pushfd + push ebx + push ebp + push esi + push edi + pushfd + pop eax + mov ebx, eax + xor eax, 0x200000 + push eax + popfd + pushfd + pop eax + xor eax, ebx + pop edi + pop esi + pop ebp + pop ebx + popfd + ret +%endif diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/fixed_dsp.asm b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/fixed_dsp.asm new file mode 100644 index 000000000..979dd5c33 --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/fixed_dsp.asm @@ -0,0 +1,48 @@ +;***************************************************************************** +;* x86-optimized Float DSP functions +;* +;* Copyright 2016 James Almer +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86util.asm" + +SECTION .text + +;----------------------------------------------------------------------------- +; void ff_butterflies_fixed(float *src0, float *src1, int len); +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal butterflies_fixed, 3,3,3, src0, src1, len + shl lend, 2 + add src0q, lenq + add src1q, lenq + neg lenq + +align 16 +.loop: + mova m0, [src0q + lenq] + mova m1, [src1q + lenq] + mova m2, m0 + paddd m0, m1 + psubd m2, m1 + mova [src0q + lenq], m0 + mova [src1q + lenq], m2 + add lenq, mmsize + jl .loop + RET diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/float_dsp.asm b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/float_dsp.asm new file mode 100644 index 000000000..517fd6363 --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/float_dsp.asm @@ -0,0 +1,484 @@ +;***************************************************************************** +;* x86-optimized Float DSP functions +;* +;* Copyright 2006 Loren Merritt +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86util.asm" + +SECTION_RODATA 32 +pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0 + +SECTION .text + +;----------------------------------------------------------------------------- +; void vector_fmul(float *dst, const float *src0, const float *src1, int len) +;----------------------------------------------------------------------------- +%macro VECTOR_FMUL 0 +cglobal vector_fmul, 4,4,2, dst, src0, src1, len + lea lenq, [lend*4 - 64] +ALIGN 16 +.loop: +%assign a 0 +%rep 32/mmsize + mova m0, [src0q + lenq + (a+0)*mmsize] + mova m1, [src0q + lenq + (a+1)*mmsize] + mulps m0, m0, [src1q + lenq + (a+0)*mmsize] + mulps m1, m1, [src1q + lenq + (a+1)*mmsize] + mova [dstq + lenq + (a+0)*mmsize], m0 + mova [dstq + lenq + (a+1)*mmsize], m1 +%assign a a+2 +%endrep + + sub lenq, 64 + jge .loop + REP_RET +%endmacro + +INIT_XMM sse +VECTOR_FMUL +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_FMUL +%endif + +;----------------------------------------------------------------------------- +; void vector_dmul(double *dst, const double *src0, const double *src1, int len) +;----------------------------------------------------------------------------- +%macro VECTOR_DMUL 0 +cglobal vector_dmul, 4,4,4, dst, src0, src1, len + lea lend, [lenq*8 - mmsize*4] +ALIGN 16 +.loop: + movaps m0, [src0q + lenq + 0*mmsize] + movaps m1, [src0q + lenq + 1*mmsize] + movaps m2, [src0q + lenq + 2*mmsize] + movaps m3, [src0q + lenq + 3*mmsize] + mulpd m0, m0, [src1q + lenq + 0*mmsize] + mulpd m1, m1, [src1q + lenq + 1*mmsize] + mulpd m2, m2, [src1q + lenq + 2*mmsize] + mulpd m3, m3, [src1q + lenq + 3*mmsize] + movaps [dstq + lenq + 0*mmsize], m0 + movaps [dstq + lenq + 1*mmsize], m1 + movaps [dstq + lenq + 2*mmsize], m2 + movaps [dstq + lenq + 3*mmsize], m3 + + sub lenq, mmsize*4 + jge .loop + RET +%endmacro + +INIT_XMM sse2 +VECTOR_DMUL +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_DMUL +%endif + +;------------------------------------------------------------------------------ +; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len) +;------------------------------------------------------------------------------ + +%macro VECTOR_FMAC_SCALAR 0 +%if UNIX64 +cglobal vector_fmac_scalar, 3,3,5, dst, src, len +%else +cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len +%endif +%if ARCH_X86_32 + VBROADCASTSS m0, mulm +%else +%if WIN64 + SWAP 0, 2 +%endif + shufps xm0, xm0, 0 +%if cpuflag(avx) + vinsertf128 m0, m0, xm0, 1 +%endif +%endif + lea lenq, [lend*4-64] +.loop: +%if cpuflag(fma3) + mova m1, [dstq+lenq] + mova m2, [dstq+lenq+1*mmsize] + fmaddps m1, m0, [srcq+lenq], m1 + fmaddps m2, m0, [srcq+lenq+1*mmsize], m2 +%else ; cpuflag + mulps m1, m0, [srcq+lenq] + mulps m2, m0, [srcq+lenq+1*mmsize] +%if mmsize < 32 + mulps m3, m0, [srcq+lenq+2*mmsize] + mulps m4, m0, [srcq+lenq+3*mmsize] +%endif ; mmsize + addps m1, m1, [dstq+lenq] + addps m2, m2, [dstq+lenq+1*mmsize] +%if mmsize < 32 + addps m3, m3, [dstq+lenq+2*mmsize] + addps m4, m4, [dstq+lenq+3*mmsize] +%endif ; mmsize +%endif ; cpuflag + mova [dstq+lenq], m1 + mova [dstq+lenq+1*mmsize], m2 +%if mmsize < 32 + mova [dstq+lenq+2*mmsize], m3 + mova [dstq+lenq+3*mmsize], m4 +%endif ; mmsize + sub lenq, 64 + jge .loop + REP_RET +%endmacro + +INIT_XMM sse +VECTOR_FMAC_SCALAR +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_FMAC_SCALAR +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +VECTOR_FMAC_SCALAR +%endif + +;------------------------------------------------------------------------------ +; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len) +;------------------------------------------------------------------------------ + +%macro VECTOR_FMUL_SCALAR 0 +%if UNIX64 +cglobal vector_fmul_scalar, 3,3,2, dst, src, len +%else +cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len +%endif +%if ARCH_X86_32 + movss m0, mulm +%elif WIN64 + SWAP 0, 2 +%endif + shufps m0, m0, 0 + lea lenq, [lend*4-mmsize] +.loop: + mova m1, [srcq+lenq] + mulps m1, m0 + mova [dstq+lenq], m1 + sub lenq, mmsize + jge .loop + REP_RET +%endmacro + +INIT_XMM sse +VECTOR_FMUL_SCALAR + +;------------------------------------------------------------------------------ +; void ff_vector_dmac_scalar(double *dst, const double *src, double mul, +; int len) +;------------------------------------------------------------------------------ + +%macro VECTOR_DMAC_SCALAR 0 +%if ARCH_X86_32 +cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr + mov lenq, lenaddrm + VBROADCASTSD m0, mulm +%else +%if UNIX64 +cglobal vector_dmac_scalar, 3,3,5, dst, src, len +%else +cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len + SWAP 0, 2 +%endif + movlhps xm0, xm0 +%if cpuflag(avx) + vinsertf128 m0, m0, xm0, 1 +%endif +%endif + lea lenq, [lend*8-mmsize*4] +.loop: +%if cpuflag(fma3) + movaps m1, [dstq+lenq] + movaps m2, [dstq+lenq+1*mmsize] + movaps m3, [dstq+lenq+2*mmsize] + movaps m4, [dstq+lenq+3*mmsize] + fmaddpd m1, m0, [srcq+lenq], m1 + fmaddpd m2, m0, [srcq+lenq+1*mmsize], m2 + fmaddpd m3, m0, [srcq+lenq+2*mmsize], m3 + fmaddpd m4, m0, [srcq+lenq+3*mmsize], m4 +%else ; cpuflag + mulpd m1, m0, [srcq+lenq] + mulpd m2, m0, [srcq+lenq+1*mmsize] + mulpd m3, m0, [srcq+lenq+2*mmsize] + mulpd m4, m0, [srcq+lenq+3*mmsize] + addpd m1, m1, [dstq+lenq] + addpd m2, m2, [dstq+lenq+1*mmsize] + addpd m3, m3, [dstq+lenq+2*mmsize] + addpd m4, m4, [dstq+lenq+3*mmsize] +%endif ; cpuflag + movaps [dstq+lenq], m1 + movaps [dstq+lenq+1*mmsize], m2 + movaps [dstq+lenq+2*mmsize], m3 + movaps [dstq+lenq+3*mmsize], m4 + sub lenq, mmsize*4 + jge .loop + REP_RET +%endmacro + +INIT_XMM sse2 +VECTOR_DMAC_SCALAR +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_DMAC_SCALAR +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +VECTOR_DMAC_SCALAR +%endif + +;------------------------------------------------------------------------------ +; void ff_vector_dmul_scalar(double *dst, const double *src, double mul, +; int len) +;------------------------------------------------------------------------------ + +%macro VECTOR_DMUL_SCALAR 0 +%if ARCH_X86_32 +cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr + mov lenq, lenaddrm +%elif UNIX64 +cglobal vector_dmul_scalar, 3,3,3, dst, src, len +%else +cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len +%endif +%if ARCH_X86_32 + VBROADCASTSD m0, mulm +%else +%if WIN64 + SWAP 0, 2 +%endif + movlhps xm0, xm0 +%if cpuflag(avx) + vinsertf128 ym0, ym0, xm0, 1 +%endif +%endif + lea lenq, [lend*8-2*mmsize] +.loop: + mulpd m1, m0, [srcq+lenq ] + mulpd m2, m0, [srcq+lenq+mmsize] + movaps [dstq+lenq ], m1 + movaps [dstq+lenq+mmsize], m2 + sub lenq, 2*mmsize + jge .loop + REP_RET +%endmacro + +INIT_XMM sse2 +VECTOR_DMUL_SCALAR +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_DMUL_SCALAR +%endif + +;----------------------------------------------------------------------------- +; vector_fmul_window(float *dst, const float *src0, +; const float *src1, const float *win, int len); +;----------------------------------------------------------------------------- +%macro VECTOR_FMUL_WINDOW 0 +cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1 + shl lend, 2 + lea len1q, [lenq - mmsize] + add src0q, lenq + add dstq, lenq + add winq, lenq + neg lenq +.loop: + mova m0, [winq + lenq] + mova m4, [src0q + lenq] +%if cpuflag(sse) + mova m1, [winq + len1q] + mova m5, [src1q + len1q] + shufps m1, m1, 0x1b + shufps m5, m5, 0x1b + mova m2, m0 + mova m3, m1 + mulps m2, m4 + mulps m3, m5 + mulps m1, m4 + mulps m0, m5 + addps m2, m3 + subps m1, m0 + shufps m2, m2, 0x1b +%else + pswapd m1, [winq + len1q] + pswapd m5, [src1q + len1q] + mova m2, m0 + mova m3, m1 + pfmul m2, m4 + pfmul m3, m5 + pfmul m1, m4 + pfmul m0, m5 + pfadd m2, m3 + pfsub m1, m0 + pswapd m2, m2 +%endif + mova [dstq + lenq], m1 + mova [dstq + len1q], m2 + sub len1q, mmsize + add lenq, mmsize + jl .loop +%if mmsize == 8 + femms +%endif + REP_RET +%endmacro + +INIT_MMX 3dnowext +VECTOR_FMUL_WINDOW +INIT_XMM sse +VECTOR_FMUL_WINDOW + +;----------------------------------------------------------------------------- +; vector_fmul_add(float *dst, const float *src0, const float *src1, +; const float *src2, int len) +;----------------------------------------------------------------------------- +%macro VECTOR_FMUL_ADD 0 +cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len + lea lenq, [lend*4 - 2*mmsize] +ALIGN 16 +.loop: + mova m0, [src0q + lenq] + mova m1, [src0q + lenq + mmsize] +%if cpuflag(fma3) + mova m2, [src2q + lenq] + mova m3, [src2q + lenq + mmsize] + fmaddps m0, m0, [src1q + lenq], m2 + fmaddps m1, m1, [src1q + lenq + mmsize], m3 +%else + mulps m0, m0, [src1q + lenq] + mulps m1, m1, [src1q + lenq + mmsize] + addps m0, m0, [src2q + lenq] + addps m1, m1, [src2q + lenq + mmsize] +%endif + mova [dstq + lenq], m0 + mova [dstq + lenq + mmsize], m1 + + sub lenq, 2*mmsize + jge .loop + REP_RET +%endmacro + +INIT_XMM sse +VECTOR_FMUL_ADD +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_FMUL_ADD +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +VECTOR_FMUL_ADD +%endif + +;----------------------------------------------------------------------------- +; void vector_fmul_reverse(float *dst, const float *src0, const float *src1, +; int len) +;----------------------------------------------------------------------------- +%macro VECTOR_FMUL_REVERSE 0 +cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len +%if cpuflag(avx2) + movaps m2, [pd_reverse] +%endif + lea lenq, [lend*4 - 2*mmsize] +ALIGN 16 +.loop: +%if cpuflag(avx2) + vpermps m0, m2, [src1q] + vpermps m1, m2, [src1q+mmsize] +%elif cpuflag(avx) + vmovaps xmm0, [src1q + 16] + vinsertf128 m0, m0, [src1q], 1 + vshufps m0, m0, m0, q0123 + vmovaps xmm1, [src1q + mmsize + 16] + vinsertf128 m1, m1, [src1q + mmsize], 1 + vshufps m1, m1, m1, q0123 +%else + mova m0, [src1q] + mova m1, [src1q + mmsize] + shufps m0, m0, q0123 + shufps m1, m1, q0123 +%endif + mulps m0, m0, [src0q + lenq + mmsize] + mulps m1, m1, [src0q + lenq] + movaps [dstq + lenq + mmsize], m0 + movaps [dstq + lenq], m1 + add src1q, 2*mmsize + sub lenq, 2*mmsize + jge .loop + REP_RET +%endmacro + +INIT_XMM sse +VECTOR_FMUL_REVERSE +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_FMUL_REVERSE +%endif +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +VECTOR_FMUL_REVERSE +%endif + +; float scalarproduct_float_sse(const float *v1, const float *v2, int len) +INIT_XMM sse +cglobal scalarproduct_float, 3,3,2, v1, v2, offset + shl offsetd, 2 + add v1q, offsetq + add v2q, offsetq + neg offsetq + xorps xmm0, xmm0 +.loop: + movaps xmm1, [v1q+offsetq] + mulps xmm1, [v2q+offsetq] + addps xmm0, xmm1 + add offsetq, 16 + js .loop + movhlps xmm1, xmm0 + addps xmm0, xmm1 + movss xmm1, xmm0 + shufps xmm0, xmm0, 1 + addss xmm0, xmm1 +%if ARCH_X86_64 == 0 + movss r0m, xmm0 + fld dword r0m +%endif + RET + +;----------------------------------------------------------------------------- +; void ff_butterflies_float(float *src0, float *src1, int len); +;----------------------------------------------------------------------------- +INIT_XMM sse +cglobal butterflies_float, 3,3,3, src0, src1, len + shl lend, 2 + add src0q, lenq + add src1q, lenq + neg lenq +.loop: + mova m0, [src0q + lenq] + mova m1, [src1q + lenq] + subps m2, m0, m1 + addps m0, m0, m1 + mova [src1q + lenq], m2 + mova [src0q + lenq], m0 + add lenq, mmsize + jl .loop + REP_RET diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/imgutils.asm b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/imgutils.asm new file mode 100644 index 000000000..3cca56cdc --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/imgutils.asm @@ -0,0 +1,53 @@ +;***************************************************************************** +;* Copyright 2016 Anton Khirnov +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +INIT_XMM sse4 +cglobal image_copy_plane_uc_from, 6, 7, 4, dst, dst_linesize, src, src_linesize, bw, height, rowpos + add dstq, bwq + add srcq, bwq + neg bwq + +.row_start: + mov rowposq, bwq + +.loop: + movntdqa m0, [srcq + rowposq + 0 * mmsize] + movntdqa m1, [srcq + rowposq + 1 * mmsize] + movntdqa m2, [srcq + rowposq + 2 * mmsize] + movntdqa m3, [srcq + rowposq + 3 * mmsize] + + mova [dstq + rowposq + 0 * mmsize], m0 + mova [dstq + rowposq + 1 * mmsize], m1 + mova [dstq + rowposq + 2 * mmsize], m2 + mova [dstq + rowposq + 3 * mmsize], m3 + + add rowposq, 4 * mmsize + jnz .loop + + add srcq, src_linesizeq + add dstq, dst_linesizeq + dec heightd + jnz .row_start + + RET diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/lls.asm b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/lls.asm new file mode 100644 index 000000000..317fba6fc --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/lls.asm @@ -0,0 +1,290 @@ +;****************************************************************************** +;* linear least squares model +;* +;* Copyright (c) 2013 Loren Merritt +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86util.asm" + +SECTION .text + +%define MAX_VARS 32 +%define MAX_VARS_ALIGN (MAX_VARS+4) +%define COVAR_STRIDE MAX_VARS_ALIGN*8 +%define COVAR(x,y) [covarq + (x)*8 + (y)*COVAR_STRIDE] + +struc LLSModel + .covariance: resq MAX_VARS_ALIGN*MAX_VARS_ALIGN + .coeff: resq MAX_VARS*MAX_VARS + .variance: resq MAX_VARS + .indep_count: resd 1 +endstruc + +%macro ADDPD_MEM 2 +%if cpuflag(avx) + vaddpd %2, %2, %1 +%else + addpd %2, %1 +%endif + mova %1, %2 +%endmacro + +INIT_XMM sse2 +%define movdqa movaps +cglobal update_lls, 2,5,8, ctx, var, i, j, covar2 + %define covarq ctxq + mov id, [ctxq + LLSModel.indep_count] + lea varq, [varq + iq*8] + neg iq + mov covar2q, covarq +.loopi: + ; Compute all 3 pairwise products of a 2x2 block that lies on the diagonal + mova m1, [varq + iq*8] + mova m3, [varq + iq*8 + 16] + pshufd m4, m1, q1010 + pshufd m5, m1, q3232 + pshufd m6, m3, q1010 + pshufd m7, m3, q3232 + mulpd m0, m1, m4 + mulpd m1, m1, m5 + lea covarq, [covar2q + 16] + ADDPD_MEM COVAR(-2,0), m0 + ADDPD_MEM COVAR(-2,1), m1 + lea jq, [iq + 2] + cmp jd, -2 + jg .skip4x4 +.loop4x4: + ; Compute all 16 pairwise products of a 4x4 block + mulpd m0, m4, m3 + mulpd m1, m5, m3 + mulpd m2, m6, m3 + mulpd m3, m3, m7 + ADDPD_MEM COVAR(0,0), m0 + ADDPD_MEM COVAR(0,1), m1 + ADDPD_MEM COVAR(0,2), m2 + ADDPD_MEM COVAR(0,3), m3 + mova m3, [varq + jq*8 + 16] + mulpd m0, m4, m3 + mulpd m1, m5, m3 + mulpd m2, m6, m3 + mulpd m3, m3, m7 + ADDPD_MEM COVAR(2,0), m0 + ADDPD_MEM COVAR(2,1), m1 + ADDPD_MEM COVAR(2,2), m2 + ADDPD_MEM COVAR(2,3), m3 + mova m3, [varq + jq*8 + 32] + add covarq, 32 + add jq, 4 + cmp jd, -2 + jle .loop4x4 +.skip4x4: + test jd, jd + jg .skip2x4 + mulpd m4, m3 + mulpd m5, m3 + mulpd m6, m3 + mulpd m7, m3 + ADDPD_MEM COVAR(0,0), m4 + ADDPD_MEM COVAR(0,1), m5 + ADDPD_MEM COVAR(0,2), m6 + ADDPD_MEM COVAR(0,3), m7 +.skip2x4: + add iq, 4 + add covar2q, 4*COVAR_STRIDE+32 + cmp id, -2 + jle .loopi + test id, id + jg .ret + mov jq, iq + %define covarq covar2q +.loop2x1: + movsd m0, [varq + iq*8] + movlhps m0, m0 + mulpd m0, [varq + jq*8] + ADDPD_MEM COVAR(0,0), m0 + inc iq + add covarq, COVAR_STRIDE + test id, id + jle .loop2x1 +.ret: + REP_RET + +%macro UPDATE_LLS 0 +cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 + %define covarq ctxq + mov countd, [ctxq + LLSModel.indep_count] + lea count2d, [countq-2] + xor id, id +.loopi: + ; Compute all 10 pairwise products of a 4x4 block that lies on the diagonal + mova ymm1, [varq + iq*8] + vbroadcastsd ymm4, [varq + iq*8] + vbroadcastsd ymm5, [varq + iq*8 + 8] + vbroadcastsd ymm6, [varq + iq*8 + 16] + vbroadcastsd ymm7, [varq + iq*8 + 24] + vextractf128 xmm3, ymm1, 1 +%if cpuflag(fma3) + mova ymm0, COVAR(iq ,0) + mova xmm2, COVAR(iq+2,2) + fmaddpd ymm0, ymm1, ymm4, ymm0 + fmaddpd xmm2, xmm3, xmm6, xmm2 + fmaddpd ymm1, ymm5, ymm1, COVAR(iq ,1) + fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3) + mova COVAR(iq ,0), ymm0 + mova COVAR(iq ,1), ymm1 + mova COVAR(iq+2,2), xmm2 + mova COVAR(iq+2,3), xmm3 +%else + vmulpd ymm0, ymm1, ymm4 + vmulpd ymm1, ymm1, ymm5 + vmulpd xmm2, xmm3, xmm6 + vmulpd xmm3, xmm3, xmm7 + ADDPD_MEM COVAR(iq ,0), ymm0 + ADDPD_MEM COVAR(iq ,1), ymm1 + ADDPD_MEM COVAR(iq+2,2), xmm2 + ADDPD_MEM COVAR(iq+2,3), xmm3 +%endif ; cpuflag(fma3) + lea jd, [iq + 4] + cmp jd, count2d + jg .skip4x4 +.loop4x4: + ; Compute all 16 pairwise products of a 4x4 block + mova ymm3, [varq + jq*8] +%if cpuflag(fma3) + mova ymm0, COVAR(jq, 0) + mova ymm1, COVAR(jq, 1) + mova ymm2, COVAR(jq, 2) + fmaddpd ymm0, ymm3, ymm4, ymm0 + fmaddpd ymm1, ymm3, ymm5, ymm1 + fmaddpd ymm2, ymm3, ymm6, ymm2 + fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3) + mova COVAR(jq, 0), ymm0 + mova COVAR(jq, 1), ymm1 + mova COVAR(jq, 2), ymm2 + mova COVAR(jq, 3), ymm3 +%else + vmulpd ymm0, ymm3, ymm4 + vmulpd ymm1, ymm3, ymm5 + vmulpd ymm2, ymm3, ymm6 + vmulpd ymm3, ymm3, ymm7 + ADDPD_MEM COVAR(jq,0), ymm0 + ADDPD_MEM COVAR(jq,1), ymm1 + ADDPD_MEM COVAR(jq,2), ymm2 + ADDPD_MEM COVAR(jq,3), ymm3 +%endif ; cpuflag(fma3) + add jd, 4 + cmp jd, count2d + jle .loop4x4 +.skip4x4: + cmp jd, countd + jg .skip2x4 + mova xmm3, [varq + jq*8] +%if cpuflag(fma3) + mova xmm0, COVAR(jq, 0) + mova xmm1, COVAR(jq, 1) + mova xmm2, COVAR(jq, 2) + fmaddpd xmm0, xmm3, xmm4, xmm0 + fmaddpd xmm1, xmm3, xmm5, xmm1 + fmaddpd xmm2, xmm3, xmm6, xmm2 + fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3) + mova COVAR(jq, 0), xmm0 + mova COVAR(jq, 1), xmm1 + mova COVAR(jq, 2), xmm2 + mova COVAR(jq, 3), xmm3 +%else + vmulpd xmm0, xmm3, xmm4 + vmulpd xmm1, xmm3, xmm5 + vmulpd xmm2, xmm3, xmm6 + vmulpd xmm3, xmm3, xmm7 + ADDPD_MEM COVAR(jq,0), xmm0 + ADDPD_MEM COVAR(jq,1), xmm1 + ADDPD_MEM COVAR(jq,2), xmm2 + ADDPD_MEM COVAR(jq,3), xmm3 +%endif ; cpuflag(fma3) +.skip2x4: + add id, 4 + add covarq, 4*COVAR_STRIDE + cmp id, count2d + jle .loopi + cmp id, countd + jg .ret + mov jd, id +.loop2x1: + vmovddup xmm0, [varq + iq*8] +%if cpuflag(fma3) + mova xmm1, [varq + jq*8] + fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0) + mova COVAR(jq,0), xmm0 +%else + vmulpd xmm0, [varq + jq*8] + ADDPD_MEM COVAR(jq,0), xmm0 +%endif ; cpuflag(fma3) + inc id + add covarq, COVAR_STRIDE + cmp id, countd + jle .loop2x1 +.ret: + REP_RET +%endmacro ; UPDATE_LLS + +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +UPDATE_LLS +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +UPDATE_LLS +%endif + +INIT_XMM sse2 +cglobal evaluate_lls, 3,4,2, ctx, var, order, i + ; This function is often called on the same buffer as update_lls, but with + ; an offset. They can't both be aligned. + ; Load halves rather than movu to avoid store-forwarding stalls, since the + ; input was initialized immediately prior to this function using scalar math. + %define coefsq ctxq + mov id, orderd + imul orderd, MAX_VARS + lea coefsq, [ctxq + LLSModel.coeff + orderq*8] + movsd m0, [varq] + movhpd m0, [varq + 8] + mulpd m0, [coefsq] + lea coefsq, [coefsq + iq*8] + lea varq, [varq + iq*8] + neg iq + add iq, 2 +.loop: + movsd m1, [varq + iq*8] + movhpd m1, [varq + iq*8 + 8] + mulpd m1, [coefsq + iq*8] + addpd m0, m1 + add iq, 2 + jl .loop + jg .skip1 + movsd m1, [varq + iq*8] + mulsd m1, [coefsq + iq*8] + addpd m0, m1 +.skip1: + movhlps m1, m0 + addsd m0, m1 +%if ARCH_X86_32 + movsd r0m, m0 + fld qword r0m +%endif + RET diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/x86inc.asm b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/x86inc.asm new file mode 100644 index 000000000..5044ee86f --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/x86inc.asm @@ -0,0 +1,1701 @@ +;***************************************************************************** +;* x86inc.asm: x264asm abstraction layer +;***************************************************************************** +;* Copyright (C) 2005-2018 x264 project +;* +;* Authors: Loren Merritt +;* Henrik Gramner +;* Anton Mitrofanov +;* Fiona Glaser +;* +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. +;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;***************************************************************************** + +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. Send patches or ideas +; to x264-devel@videolan.org . + +%ifndef private_prefix + %define private_prefix x264 +%endif + +%ifndef public_prefix + %define public_prefix private_prefix +%endif + +%if HAVE_ALIGNED_STACK + %define STACK_ALIGNMENT 16 +%endif +%ifndef STACK_ALIGNMENT + %if ARCH_X86_64 + %define STACK_ALIGNMENT 16 + %else + %define STACK_ALIGNMENT 4 + %endif +%endif + +%define WIN64 0 +%define UNIX64 0 +%if ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,x64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%define FORMAT_ELF 0 +%ifidn __OUTPUT_FORMAT__,elf + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf32 + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf64 + %define FORMAT_ELF 1 +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +; aout does not support align= +; NOTE: This section is out of sync with x264, in order to +; keep supporting OS/2. +%macro SECTION_RODATA 0-1 16 + %ifidn __OUTPUT_FORMAT__,aout + SECTION .text + %elifidn __OUTPUT_FORMAT__,coff + SECTION .text + %elifidn __OUTPUT_FORMAT__,win32 + SECTION .rdata align=%1 + %elif WIN64 + SECTION .rdata align=%1 + %else + SECTION .rodata align=%1 + %endif +%endmacro + +%if WIN64 + %define PIC +%elif ARCH_X86_64 == 0 +; x86_32 doesn't require PIC. +; Some distros prefer shared objects to be PIC, but nothing breaks if +; the code contains a few textrels, so we'll skip that complexity. + %undef PIC +%endif +%ifdef PIC + default rel +%endif + +%macro CPUNOP 1 + %if HAVE_CPUNOP + CPU %1 + %endif +%endmacro + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. +; %4 = (optional) stack size to be allocated. The stack will be aligned before +; allocating the specified stack size. If the required stack alignment is +; larger than the known stack alignment the stack will be manually aligned +; and an extra register will be allocated to hold the original stack +; pointer (to not invalidate r0m etc.). To prevent the use of an extra +; register as stack pointer, request a negative stack size. +; %4+/%5+ = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,7,0x40, dst, src, tmp +; declares a function (foo) that automatically loads two arguments (dst and +; src) into registers, uses one additional register (tmp) plus 7 vector +; registers (m0-m6) and allocates 0x40 bytes of stack space. + +; TODO Some functions can use some args directly from the stack. If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Use this instead of RET if it's a branch target. + +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %define %2q %2 + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif ARCH_X86_64 ; memory + %define r%1m [rstk + stack_offset + %3] + %define r%1mp qword r %+ %1 %+ m + %else + %define r%1m [rstk + stack_offset + %3] + %define r%1mp dword r %+ %1 %+ m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 + %if ARCH_X86_64 == 0 + %define r%1 e%1 + %endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +%macro PUSH 1 + push %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset+gprsize + %endif +%endmacro + +%macro POP 1 + pop %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset-gprsize + %endif +%endmacro + +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif +%endmacro + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assertion ``%1'' failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, h + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m + CAT_UNDEF arg_name %+ %%i, mp + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %xdefine %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine + %assign %%i 0 + %rep %0 + %xdefine %1q r %+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1h r %+ %%i %+ h + %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m + %xdefine %1mp r %+ %%i %+ mp + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + %rotate 1 + %endrep + %xdefine stack_offset %%stack_offset + %assign n_arg_names %0 +%endmacro + +%define required_stack_alignment ((mmsize + 15) & ~15) +%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) +%define high_mm_regs (16*cpuflag(avx512)) + +%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) + %ifnum %1 + %if %1 != 0 + %assign %%pad 0 + %assign stack_size %1 + %if stack_size < 0 + %assign stack_size -stack_size + %endif + %if WIN64 + %assign %%pad %%pad + 32 ; shadow space + %if mmsize != 8 + %assign xmm_regs_used %2 + %if xmm_regs_used > 8 + %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers + %endif + %endif + %endif + %if required_stack_alignment <= STACK_ALIGNMENT + ; maintain the current stack alignment + %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %else + %assign %%reg_num (regs_used - 1) + %xdefine rstk r %+ %%reg_num + ; align stack, and save original stack location directly above + ; it, i.e. in [rsp+stack_size_padded], so we can restore the + ; stack in a single instruction (i.e. mov rsp, rstk or mov + ; rsp, [rsp+stack_size_padded]) + %if %1 < 0 ; need to store rsp on stack + %xdefine rstkm [rsp + stack_size + %%pad] + %assign %%pad %%pad + gprsize + %else ; can keep rsp in rstk during whole function + %xdefine rstkm rstk + %endif + %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) + mov rstk, rsp + and rsp, ~(required_stack_alignment-1) + sub rsp, stack_size_padded + movifnidn rstkm, rstk + %endif + WIN64_PUSH_XMM + %endif + %endif +%endmacro + +%macro SETUP_STACK_POINTER 1 + %ifnum %1 + %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT + %if %1 > 0 + ; Reserve an additional register for storing the original stack pointer, but avoid using + ; eax/rax for this purpose since it can potentially get overwritten as a return value. + %assign regs_used (regs_used + 1) + %if ARCH_X86_64 && regs_used == 7 + %assign regs_used 8 + %elif ARCH_X86_64 == 0 && regs_used == 1 + %assign regs_used 2 + %endif + %endif + %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 + ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax) + ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used. + %assign regs_used 5 + UNIX64 * 3 + %endif + %endif + %endif +%endmacro + +%macro DEFINE_ARGS_INTERNAL 3+ + %ifnum %2 + DEFINE_ARGS %3 + %elif %1 == 4 + DEFINE_ARGS %2 + %elif %1 > 4 + DEFINE_ARGS %2, %3 + %endif +%endmacro + +%if WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R14, 96 +DECLARE_REG 12, R15, 104 +DECLARE_REG 13, R12, 112 +DECLARE_REG 14, R13, 120 + +%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4, %3 + %if mmsize != 8 && stack_size == 0 + WIN64_SPILL_XMM %3 + %endif + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%macro WIN64_PUSH_XMM 0 + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. + %if xmm_regs_used > 6 + high_mm_regs + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if xmm_regs_used > 7 + high_mm_regs + movaps [rstk + stack_offset + 24], xmm7 + %endif + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 + %assign %%i 8 + %rep %%xmm_regs_on_stack + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + high_mm_regs + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 + ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. + %assign %%pad %%xmm_regs_on_stack*16 + 32 + %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %endif + WIN64_PUSH_XMM +%endmacro + +%macro WIN64_RESTORE_XMM_INTERNAL 0 + %assign %%pad_size 0 + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 + %assign %%i xmm_regs_used - high_mm_regs + %rep %%xmm_regs_on_stack + %assign %%i %%i-1 + movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32] + %endrep + %endif + %if stack_size_padded > 0 + %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %assign %%pad_size stack_size_padded + %endif + %endif + %if xmm_regs_used > 7 + high_mm_regs + movaps xmm7, [rsp + stack_offset - %%pad_size + 24] + %endif + %if xmm_regs_used > 6 + high_mm_regs + movaps xmm6, [rsp + stack_offset - %%pad_size + 8] + %endif +%endmacro + +%macro WIN64_RESTORE_XMM 0 + WIN64_RESTORE_XMM_INTERNAL + %assign stack_offset (stack_offset-stack_size_padded) + %assign stack_size_padded 0 + %assign xmm_regs_used 0 +%endmacro + +%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs + +%macro RET 0 + WIN64_RESTORE_XMM_INTERNAL + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 + %if vzeroupper_required + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%elif ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi +DECLARE_REG 1, rsi +DECLARE_REG 2, rdx +DECLARE_REG 3, rcx +DECLARE_REG 4, R8 +DECLARE_REG 5, R9 +DECLARE_REG 6, rax, 8 +DECLARE_REG 7, R10, 16 +DECLARE_REG 8, R11, 24 +DECLARE_REG 9, rbx, 32 +DECLARE_REG 10, rbp, 40 +DECLARE_REG 11, R14, 48 +DECLARE_REG 12, R15, 56 +DECLARE_REG 13, R12, 64 +DECLARE_REG 14, R13, 72 + +%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + %assign xmm_regs_used %3 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 14, 13, 12, 11, 10, 9 + %if vzeroupper_required + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, 4 +DECLARE_REG 1, ecx, 8 +DECLARE_REG 2, edx, 12 +DECLARE_REG 3, ebx, 16 +DECLARE_REG 4, esi, 20 +DECLARE_REG 5, edi, 24 +DECLARE_REG 6, ebp, 28 +%define rsp esp + +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [rstk + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep +%endmacro + +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + %if num_args > 7 + %assign num_args 7 + %endif + %if regs_used > 7 + %assign regs_used 7 + %endif + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 7 + PUSH_IF_USED 3, 4, 5, 6 + ALLOC_STACK %4 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 6, 5, 4, 3 + %if vzeroupper_required + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%endif ;====================================================================== + +%if WIN64 == 0 + %macro WIN64_SPILL_XMM 1 + %endmacro + %macro WIN64_RESTORE_XMM 0 + %endmacro + %macro WIN64_PUSH_XMM 0 + %endmacro +%endif + +; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either +; a branch or a branch target. So switch to a 2-byte form of ret in that case. +; We can automatically detect "follows a branch", but not a branch target. +; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) +%macro REP_RET 0 + %if has_epilogue || cpuflag(ssse3) + RET + %else + rep ret + %endif + annotate_function_size +%endmacro + +%define last_branch_adr $$ +%macro AUTO_REP_RET 0 + %if notcpuflag(ssse3) + times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr. + %endif + ret + annotate_function_size +%endmacro + +%macro BRANCH_INSTR 0-* + %rep %0 + %macro %1 1-2 %1 + %2 %1 + %if notcpuflag(ssse3) + %%branch_instr equ $ + %xdefine last_branch_adr %%branch_instr + %endif + %endmacro + %rotate 1 + %endrep +%endmacro + +BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp + +%macro TAIL_CALL 2 ; callee, is_nonadjacent + %if has_epilogue + call %1 + RET + %elif %2 + jmp %1 + %endif + annotate_function_size +%endmacro + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Begin a function. +; Applies any symbol mangling needed for C linkage, and sets up a define such that +; subsequent uses of the function name automatically refer to the mangled version. +; Appends cpuflags to the function name if cpuflags has been specified. +; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX +; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). +%macro cglobal 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 1, %1 %+ SUFFIX, %2 +%endmacro +%macro cvisible 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 0, %1 %+ SUFFIX, %2 +%endmacro +%macro cglobal_internal 2-3+ + annotate_function_size + %if %1 + %xdefine %%FUNCTION_PREFIX private_prefix + %xdefine %%VISIBILITY hidden + %else + %xdefine %%FUNCTION_PREFIX public_prefix + %xdefine %%VISIBILITY + %endif + %ifndef cglobaled_%2 + %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) + %xdefine %2.skip_prologue %2 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %2, 1 + %endif + %xdefine current_function %2 + %xdefine current_function_section __SECT__ + %if FORMAT_ELF + global %2:function %%VISIBILITY + %else + global %2 + %endif + align function_align + %2: + RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer + %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required + %assign stack_offset 0 ; stack pointer offset relative to the return address + %assign stack_size 0 ; amount of stack space that can be freely used inside a function + %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper + %ifnidn %3, "" + PROLOGUE %3 + %endif +%endmacro + +; Create a global symbol from a local label with the correct name mangling and type +%macro cglobal_label 1 + %if FORMAT_ELF + global current_function %+ %1:function hidden + %else + global current_function %+ %1 + %endif + %1: +%endmacro + +%macro cextern 1 + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +; like cextern, but without the prefix +%macro cextern_naked 1 + %ifdef PREFIX + %xdefine %1 mangle(%1) + %endif + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +%macro const 1-2+ + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + %if FORMAT_ELF + global %1:data hidden + %else + global %1 + %endif + %1: %2 +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default. +%if FORMAT_ELF + [SECTION .note.GNU-stack noalloc noexec nowrite progbits] +%endif + +; Tell debuggers how large the function was. +; This may be invoked multiple times per function; we rely on later instances overriding earlier ones. +; This is invoked by RET and similar macros, and also cglobal does it for the previous function, +; but if the last function in a source file doesn't use any of the standard macros for its epilogue, +; then its size might be unspecified. +%macro annotate_function_size 0 + %ifdef __YASM_VER__ + %ifdef current_function + %if FORMAT_ELF + current_function_section + %%ecf equ $ + size current_function %%ecf - current_function + __SECT__ + %endif + %endif + %endif +%endmacro + +; cpuflags + +%assign cpuflags_mmx (1<<0) +%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx +%assign cpuflags_3dnow (1<<2) | cpuflags_mmx +%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow +%assign cpuflags_sse (1<<4) | cpuflags_mmx2 +%assign cpuflags_sse2 (1<<5) | cpuflags_sse +%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 +%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<8) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3 +%assign cpuflags_sse42 (1<<11)| cpuflags_sse4 +%assign cpuflags_aesni (1<<12)| cpuflags_sse42 +%assign cpuflags_avx (1<<13)| cpuflags_sse42 +%assign cpuflags_xop (1<<14)| cpuflags_avx +%assign cpuflags_fma4 (1<<15)| cpuflags_avx +%assign cpuflags_fma3 (1<<16)| cpuflags_avx +%assign cpuflags_bmi1 (1<<17)| cpuflags_avx|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<18)| cpuflags_bmi1 +%assign cpuflags_avx2 (1<<19)| cpuflags_fma3|cpuflags_bmi2 +%assign cpuflags_avx512 (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL + +%assign cpuflags_cache32 (1<<21) +%assign cpuflags_cache64 (1<<22) +%assign cpuflags_aligned (1<<23) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<24) + +; Returns a boolean value expressing whether or not the specified cpuflag is enabled. +%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) +%define notcpuflag(x) (cpuflag(x) ^ 1) + +; Takes an arbitrary number of cpuflags from the above list. +; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. +; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. +%macro INIT_CPUFLAGS 0-* + %xdefine SUFFIX + %undef cpuname + %assign cpuflags 0 + + %if %0 >= 1 + %rep %0 + %ifdef cpuname + %xdefine cpuname cpuname %+ _%1 + %else + %xdefine cpuname %1 + %endif + %assign cpuflags cpuflags | cpuflags_%1 + %rotate 1 + %endrep + %xdefine SUFFIX _ %+ cpuname + + %if cpuflag(avx) + %assign avx_enabled 1 + %endif + %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) + %define mova movaps + %define movu movups + %define movnta movntps + %endif + %if cpuflag(aligned) + %define movu mova + %elif cpuflag(sse3) && notcpuflag(ssse3) + %define movu lddqu + %endif + %endif + + %if ARCH_X86_64 || cpuflag(sse2) + CPUNOP amdnop + %else + CPUNOP basicnop + %endif +%endmacro + +; Merge mmx, sse*, and avx* +; m# is a simd register of the currently selected size +; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# +; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# +; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m# +; (All 4 remain in sync through SWAP.) + +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro DEFINE_MMREGS 1 ; mmtype + %assign %%prev_mmregs 0 + %ifdef num_mmregs + %assign %%prev_mmregs num_mmregs + %endif + + %assign num_mmregs 8 + %if ARCH_X86_64 && mmsize >= 16 + %assign num_mmregs 16 + %if cpuflag(avx512) || mmsize == 64 + %assign num_mmregs 32 + %endif + %endif + + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1 %+ %%i + CAT_XDEFINE nn%1, %%i, %%i + %assign %%i %%i+1 + %endrep + %if %%prev_mmregs > num_mmregs + %rep %%prev_mmregs - num_mmregs + CAT_UNDEF m, %%i + CAT_UNDEF nn %+ mmtype, %%i + %assign %%i %%i+1 + %endrep + %endif + %xdefine mmtype %1 +%endmacro + +; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper +%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg + %if ARCH_X86_64 && cpuflag(avx512) + %assign %%i %1 + %rep 16-%1 + %assign %%i_high %%i+16 + SWAP %%i, %%i_high + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +%macro INIT_MMX 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_MMX %1 + %define mmsize 8 + %define mova movq + %define movu movq + %define movh movd + %define movnta movntq + INIT_CPUFLAGS %1 + DEFINE_MMREGS mm +%endmacro + +%macro INIT_XMM 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_XMM %1 + %define mmsize 16 + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnta movntdq + INIT_CPUFLAGS %1 + DEFINE_MMREGS xmm + %if WIN64 + AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers + %endif +%endmacro + +%macro INIT_YMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_YMM %1 + %define mmsize 32 + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + INIT_CPUFLAGS %1 + DEFINE_MMREGS ymm + AVX512_MM_PERMUTATION +%endmacro + +%macro INIT_ZMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_ZMM %1 + %define mmsize 64 + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + INIT_CPUFLAGS %1 + DEFINE_MMREGS zmm + AVX512_MM_PERMUTATION +%endmacro + +INIT_XMM + +%macro DECLARE_MMCAST 1 + %define mmmm%1 mm%1 + %define mmxmm%1 mm%1 + %define mmymm%1 mm%1 + %define mmzmm%1 mm%1 + %define xmmmm%1 mm%1 + %define xmmxmm%1 xmm%1 + %define xmmymm%1 xmm%1 + %define xmmzmm%1 xmm%1 + %define ymmmm%1 mm%1 + %define ymmxmm%1 xmm%1 + %define ymmymm%1 ymm%1 + %define ymmzmm%1 ymm%1 + %define zmmmm%1 mm%1 + %define zmmxmm%1 xmm%1 + %define zmmymm%1 ymm%1 + %define zmmzmm%1 zmm%1 + %define xm%1 xmm %+ m%1 + %define ym%1 ymm %+ m%1 + %define zm%1 zmm %+ m%1 +%endmacro + +%assign i 0 +%rep 32 + DECLARE_MMCAST i + %assign i i+1 +%endrep + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. +; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. + +%macro PERMUTE 2-* ; takes a list of pairs to swap + %rep %0/2 + %xdefine %%tmp%2 m%2 + %rotate 2 + %endrep + %rep %0/2 + %xdefine m%1 %%tmp%2 + CAT_XDEFINE nn, m%1, %1 + %rotate 2 + %endrep +%endmacro + +%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) + %ifnum %1 ; SWAP 0, 1, ... + SWAP_INTERNAL_NUM %1, %2 + %else ; SWAP m0, m1, ... + SWAP_INTERNAL_NAME %1, %2 + %endif +%endmacro + +%macro SWAP_INTERNAL_NUM 2-* + %rep %0-1 + %xdefine %%tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 %%tmp + CAT_XDEFINE nn, m%1, %1 + CAT_XDEFINE nn, m%2, %2 + %rotate 1 + %endrep +%endmacro + +%macro SWAP_INTERNAL_NAME 2-* + %xdefine %%args nn %+ %1 + %rep %0-1 + %xdefine %%args %%args, nn %+ %2 + %rotate 1 + %endrep + SWAP_INTERNAL_NUM %%args +%endmacro + +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later +; calls to that function will automatically load the permutation, so values can +; be returned in mmregs. +%macro SAVE_MM_PERMUTATION 0-1 + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE %%f, %%i, m %+ %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 1 ; name to load from + %ifdef %1_m0 + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1_m %+ %%i + CAT_XDEFINE nn, m %+ %%i, %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't +%macro call 1 + %ifid %1 + call_internal %1 %+ SUFFIX, %1 + %else + call %1 + %endif +%endmacro +%macro call_internal 2 + %xdefine %%i %2 + %ifndef cglobaled_%2 + %ifdef cglobaled_%1 + %xdefine %%i %1 + %endif + %endif + call %%i + LOAD_MM_PERMUTATION %%i +%endmacro + +; Substitutions that reduce instruction size but are functionally equivalent +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + %else + sub %1, %2 + %endif +%endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%assign i 0 +%rep 32 + %if i < 8 + CAT_XDEFINE sizeofmm, i, 8 + CAT_XDEFINE regnumofmm, i, i + %endif + CAT_XDEFINE sizeofxmm, i, 16 + CAT_XDEFINE sizeofymm, i, 32 + CAT_XDEFINE sizeofzmm, i, 64 + CAT_XDEFINE regnumofxmm, i, i + CAT_XDEFINE regnumofymm, i, i + CAT_XDEFINE regnumofzmm, i, i + %assign i i+1 +%endrep +%undef i + +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-avx emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) +;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not +;%6+: operands +%macro RUN_AVX_INSTR 6-9+ + %ifnum sizeof%7 + %assign __sizeofreg sizeof%7 + %elifnum sizeof%6 + %assign __sizeofreg sizeof%6 + %else + %assign __sizeofreg mmsize + %endif + %assign __emulate_avx 0 + %if avx_enabled && __sizeofreg >= 16 + %xdefine __instr v%1 + %else + %xdefine __instr %1 + %if %0 >= 8+%4 + %assign __emulate_avx 1 + %endif + %endif + %ifnidn %2, fnord + %ifdef cpuname + %if notcpuflag(%2) + %error use of ``%1'' %2 instruction in cpuname function: current_function + %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8 + %error use of ``%1'' sse2 instruction in cpuname function: current_function + %endif + %endif + %endif + + %if __emulate_avx + %xdefine __src1 %7 + %xdefine __src2 %8 + %if %5 && %4 == 0 + %ifnidn %6, %7 + %ifidn %6, %8 + %xdefine __src1 %8 + %xdefine __src2 %7 + %elifnnum sizeof%8 + ; 3-operand AVX instructions with a memory arg can only have it in src2, + ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). + ; So, if the instruction is commutative with a memory arg, swap them. + %xdefine __src1 %8 + %xdefine __src2 %7 + %endif + %endif + %endif + %ifnidn %6, __src1 + %if %0 >= 9 + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9 + %else + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2 + %endif + %if __sizeofreg == 8 + MOVQ %6, __src1 + %elif %3 + MOVAPS %6, __src1 + %else + MOVDQA %6, __src1 + %endif + %endif + %if %0 >= 9 + %1 %6, __src2, %9 + %else + %1 %6, __src2 + %endif + %elif %0 >= 9 + __instr %6, %7, %8, %9 + %elif %0 == 8 + __instr %6, %7, %8 + %elif %0 == 7 + __instr %6, %7 + %else + __instr %6 + %endif +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) +;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 1-5 fnord, 0, 255, 0 + %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 + %ifidn %2, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 + %elifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +; Instructions with both VEX/EVEX and legacy encodings +; Non-destructive instructions are written without parameters +AVX_INSTR addpd, sse2, 1, 0, 1 +AVX_INSTR addps, sse, 1, 0, 1 +AVX_INSTR addsd, sse2, 1, 0, 0 +AVX_INSTR addss, sse, 1, 0, 0 +AVX_INSTR addsubpd, sse3, 1, 0, 0 +AVX_INSTR addsubps, sse3, 1, 0, 0 +AVX_INSTR aesdec, aesni, 0, 0, 0 +AVX_INSTR aesdeclast, aesni, 0, 0, 0 +AVX_INSTR aesenc, aesni, 0, 0, 0 +AVX_INSTR aesenclast, aesni, 0, 0, 0 +AVX_INSTR aesimc, aesni +AVX_INSTR aeskeygenassist, aesni +AVX_INSTR andnpd, sse2, 1, 0, 0 +AVX_INSTR andnps, sse, 1, 0, 0 +AVX_INSTR andpd, sse2, 1, 0, 1 +AVX_INSTR andps, sse, 1, 0, 1 +AVX_INSTR blendpd, sse4, 1, 1, 0 +AVX_INSTR blendps, sse4, 1, 1, 0 +AVX_INSTR blendvpd, sse4 ; can't be emulated +AVX_INSTR blendvps, sse4 ; can't be emulated +AVX_INSTR cmpeqpd, sse2, 1, 0, 1 +AVX_INSTR cmpeqps, sse, 1, 0, 1 +AVX_INSTR cmpeqsd, sse2, 1, 0, 0 +AVX_INSTR cmpeqss, sse, 1, 0, 0 +AVX_INSTR cmplepd, sse2, 1, 0, 0 +AVX_INSTR cmpleps, sse, 1, 0, 0 +AVX_INSTR cmplesd, sse2, 1, 0, 0 +AVX_INSTR cmpless, sse, 1, 0, 0 +AVX_INSTR cmpltpd, sse2, 1, 0, 0 +AVX_INSTR cmpltps, sse, 1, 0, 0 +AVX_INSTR cmpltsd, sse2, 1, 0, 0 +AVX_INSTR cmpltss, sse, 1, 0, 0 +AVX_INSTR cmpneqpd, sse2, 1, 0, 1 +AVX_INSTR cmpneqps, sse, 1, 0, 1 +AVX_INSTR cmpneqsd, sse2, 1, 0, 0 +AVX_INSTR cmpneqss, sse, 1, 0, 0 +AVX_INSTR cmpnlepd, sse2, 1, 0, 0 +AVX_INSTR cmpnleps, sse, 1, 0, 0 +AVX_INSTR cmpnlesd, sse2, 1, 0, 0 +AVX_INSTR cmpnless, sse, 1, 0, 0 +AVX_INSTR cmpnltpd, sse2, 1, 0, 0 +AVX_INSTR cmpnltps, sse, 1, 0, 0 +AVX_INSTR cmpnltsd, sse2, 1, 0, 0 +AVX_INSTR cmpnltss, sse, 1, 0, 0 +AVX_INSTR cmpordpd, sse2 1, 0, 1 +AVX_INSTR cmpordps, sse 1, 0, 1 +AVX_INSTR cmpordsd, sse2 1, 0, 0 +AVX_INSTR cmpordss, sse 1, 0, 0 +AVX_INSTR cmppd, sse2, 1, 1, 0 +AVX_INSTR cmpps, sse, 1, 1, 0 +AVX_INSTR cmpsd, sse2, 1, 1, 0 +AVX_INSTR cmpss, sse, 1, 1, 0 +AVX_INSTR cmpunordpd, sse2, 1, 0, 1 +AVX_INSTR cmpunordps, sse, 1, 0, 1 +AVX_INSTR cmpunordsd, sse2, 1, 0, 0 +AVX_INSTR cmpunordss, sse, 1, 0, 0 +AVX_INSTR comisd, sse2 +AVX_INSTR comiss, sse +AVX_INSTR cvtdq2pd, sse2 +AVX_INSTR cvtdq2ps, sse2 +AVX_INSTR cvtpd2dq, sse2 +AVX_INSTR cvtpd2ps, sse2 +AVX_INSTR cvtps2dq, sse2 +AVX_INSTR cvtps2pd, sse2 +AVX_INSTR cvtsd2si, sse2 +AVX_INSTR cvtsd2ss, sse2, 1, 0, 0 +AVX_INSTR cvtsi2sd, sse2, 1, 0, 0 +AVX_INSTR cvtsi2ss, sse, 1, 0, 0 +AVX_INSTR cvtss2sd, sse2, 1, 0, 0 +AVX_INSTR cvtss2si, sse +AVX_INSTR cvttpd2dq, sse2 +AVX_INSTR cvttps2dq, sse2 +AVX_INSTR cvttsd2si, sse2 +AVX_INSTR cvttss2si, sse +AVX_INSTR divpd, sse2, 1, 0, 0 +AVX_INSTR divps, sse, 1, 0, 0 +AVX_INSTR divsd, sse2, 1, 0, 0 +AVX_INSTR divss, sse, 1, 0, 0 +AVX_INSTR dppd, sse4, 1, 1, 0 +AVX_INSTR dpps, sse4, 1, 1, 0 +AVX_INSTR extractps, sse4 +AVX_INSTR haddpd, sse3, 1, 0, 0 +AVX_INSTR haddps, sse3, 1, 0, 0 +AVX_INSTR hsubpd, sse3, 1, 0, 0 +AVX_INSTR hsubps, sse3, 1, 0, 0 +AVX_INSTR insertps, sse4, 1, 1, 0 +AVX_INSTR lddqu, sse3 +AVX_INSTR ldmxcsr, sse +AVX_INSTR maskmovdqu, sse2 +AVX_INSTR maxpd, sse2, 1, 0, 1 +AVX_INSTR maxps, sse, 1, 0, 1 +AVX_INSTR maxsd, sse2, 1, 0, 0 +AVX_INSTR maxss, sse, 1, 0, 0 +AVX_INSTR minpd, sse2, 1, 0, 1 +AVX_INSTR minps, sse, 1, 0, 1 +AVX_INSTR minsd, sse2, 1, 0, 0 +AVX_INSTR minss, sse, 1, 0, 0 +AVX_INSTR movapd, sse2 +AVX_INSTR movaps, sse +AVX_INSTR movd, mmx +AVX_INSTR movddup, sse3 +AVX_INSTR movdqa, sse2 +AVX_INSTR movdqu, sse2 +AVX_INSTR movhlps, sse, 1, 0, 0 +AVX_INSTR movhpd, sse2, 1, 0, 0 +AVX_INSTR movhps, sse, 1, 0, 0 +AVX_INSTR movlhps, sse, 1, 0, 0 +AVX_INSTR movlpd, sse2, 1, 0, 0 +AVX_INSTR movlps, sse, 1, 0, 0 +AVX_INSTR movmskpd, sse2 +AVX_INSTR movmskps, sse +AVX_INSTR movntdq, sse2 +AVX_INSTR movntdqa, sse4 +AVX_INSTR movntpd, sse2 +AVX_INSTR movntps, sse +AVX_INSTR movq, mmx +AVX_INSTR movsd, sse2, 1, 0, 0 +AVX_INSTR movshdup, sse3 +AVX_INSTR movsldup, sse3 +AVX_INSTR movss, sse, 1, 0, 0 +AVX_INSTR movupd, sse2 +AVX_INSTR movups, sse +AVX_INSTR mpsadbw, sse4, 0, 1, 0 +AVX_INSTR mulpd, sse2, 1, 0, 1 +AVX_INSTR mulps, sse, 1, 0, 1 +AVX_INSTR mulsd, sse2, 1, 0, 0 +AVX_INSTR mulss, sse, 1, 0, 0 +AVX_INSTR orpd, sse2, 1, 0, 1 +AVX_INSTR orps, sse, 1, 0, 1 +AVX_INSTR pabsb, ssse3 +AVX_INSTR pabsd, ssse3 +AVX_INSTR pabsw, ssse3 +AVX_INSTR packsswb, mmx, 0, 0, 0 +AVX_INSTR packssdw, mmx, 0, 0, 0 +AVX_INSTR packuswb, mmx, 0, 0, 0 +AVX_INSTR packusdw, sse4, 0, 0, 0 +AVX_INSTR paddb, mmx, 0, 0, 1 +AVX_INSTR paddw, mmx, 0, 0, 1 +AVX_INSTR paddd, mmx, 0, 0, 1 +AVX_INSTR paddq, sse2, 0, 0, 1 +AVX_INSTR paddsb, mmx, 0, 0, 1 +AVX_INSTR paddsw, mmx, 0, 0, 1 +AVX_INSTR paddusb, mmx, 0, 0, 1 +AVX_INSTR paddusw, mmx, 0, 0, 1 +AVX_INSTR palignr, ssse3, 0, 1, 0 +AVX_INSTR pand, mmx, 0, 0, 1 +AVX_INSTR pandn, mmx, 0, 0, 0 +AVX_INSTR pavgb, mmx2, 0, 0, 1 +AVX_INSTR pavgw, mmx2, 0, 0, 1 +AVX_INSTR pblendvb, sse4 ; can't be emulated +AVX_INSTR pblendw, sse4, 0, 1, 0 +AVX_INSTR pclmulqdq, fnord, 0, 1, 0 +AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0 +AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0 +AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0 +AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0 +AVX_INSTR pcmpestri, sse42 +AVX_INSTR pcmpestrm, sse42 +AVX_INSTR pcmpistri, sse42 +AVX_INSTR pcmpistrm, sse42 +AVX_INSTR pcmpeqb, mmx, 0, 0, 1 +AVX_INSTR pcmpeqw, mmx, 0, 0, 1 +AVX_INSTR pcmpeqd, mmx, 0, 0, 1 +AVX_INSTR pcmpeqq, sse4, 0, 0, 1 +AVX_INSTR pcmpgtb, mmx, 0, 0, 0 +AVX_INSTR pcmpgtw, mmx, 0, 0, 0 +AVX_INSTR pcmpgtd, mmx, 0, 0, 0 +AVX_INSTR pcmpgtq, sse42, 0, 0, 0 +AVX_INSTR pextrb, sse4 +AVX_INSTR pextrd, sse4 +AVX_INSTR pextrq, sse4 +AVX_INSTR pextrw, mmx2 +AVX_INSTR phaddw, ssse3, 0, 0, 0 +AVX_INSTR phaddd, ssse3, 0, 0, 0 +AVX_INSTR phaddsw, ssse3, 0, 0, 0 +AVX_INSTR phminposuw, sse4 +AVX_INSTR phsubw, ssse3, 0, 0, 0 +AVX_INSTR phsubd, ssse3, 0, 0, 0 +AVX_INSTR phsubsw, ssse3, 0, 0, 0 +AVX_INSTR pinsrb, sse4, 0, 1, 0 +AVX_INSTR pinsrd, sse4, 0, 1, 0 +AVX_INSTR pinsrq, sse4, 0, 1, 0 +AVX_INSTR pinsrw, mmx2, 0, 1, 0 +AVX_INSTR pmaddwd, mmx, 0, 0, 1 +AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 +AVX_INSTR pmaxsb, sse4, 0, 0, 1 +AVX_INSTR pmaxsw, mmx2, 0, 0, 1 +AVX_INSTR pmaxsd, sse4, 0, 0, 1 +AVX_INSTR pmaxub, mmx2, 0, 0, 1 +AVX_INSTR pmaxuw, sse4, 0, 0, 1 +AVX_INSTR pmaxud, sse4, 0, 0, 1 +AVX_INSTR pminsb, sse4, 0, 0, 1 +AVX_INSTR pminsw, mmx2, 0, 0, 1 +AVX_INSTR pminsd, sse4, 0, 0, 1 +AVX_INSTR pminub, mmx2, 0, 0, 1 +AVX_INSTR pminuw, sse4, 0, 0, 1 +AVX_INSTR pminud, sse4, 0, 0, 1 +AVX_INSTR pmovmskb, mmx2 +AVX_INSTR pmovsxbw, sse4 +AVX_INSTR pmovsxbd, sse4 +AVX_INSTR pmovsxbq, sse4 +AVX_INSTR pmovsxwd, sse4 +AVX_INSTR pmovsxwq, sse4 +AVX_INSTR pmovsxdq, sse4 +AVX_INSTR pmovzxbw, sse4 +AVX_INSTR pmovzxbd, sse4 +AVX_INSTR pmovzxbq, sse4 +AVX_INSTR pmovzxwd, sse4 +AVX_INSTR pmovzxwq, sse4 +AVX_INSTR pmovzxdq, sse4 +AVX_INSTR pmuldq, sse4, 0, 0, 1 +AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 +AVX_INSTR pmulhuw, mmx2, 0, 0, 1 +AVX_INSTR pmulhw, mmx, 0, 0, 1 +AVX_INSTR pmullw, mmx, 0, 0, 1 +AVX_INSTR pmulld, sse4, 0, 0, 1 +AVX_INSTR pmuludq, sse2, 0, 0, 1 +AVX_INSTR por, mmx, 0, 0, 1 +AVX_INSTR psadbw, mmx2, 0, 0, 1 +AVX_INSTR pshufb, ssse3, 0, 0, 0 +AVX_INSTR pshufd, sse2 +AVX_INSTR pshufhw, sse2 +AVX_INSTR pshuflw, sse2 +AVX_INSTR psignb, ssse3, 0, 0, 0 +AVX_INSTR psignw, ssse3, 0, 0, 0 +AVX_INSTR psignd, ssse3, 0, 0, 0 +AVX_INSTR psllw, mmx, 0, 0, 0 +AVX_INSTR pslld, mmx, 0, 0, 0 +AVX_INSTR psllq, mmx, 0, 0, 0 +AVX_INSTR pslldq, sse2, 0, 0, 0 +AVX_INSTR psraw, mmx, 0, 0, 0 +AVX_INSTR psrad, mmx, 0, 0, 0 +AVX_INSTR psrlw, mmx, 0, 0, 0 +AVX_INSTR psrld, mmx, 0, 0, 0 +AVX_INSTR psrlq, mmx, 0, 0, 0 +AVX_INSTR psrldq, sse2, 0, 0, 0 +AVX_INSTR psubb, mmx, 0, 0, 0 +AVX_INSTR psubw, mmx, 0, 0, 0 +AVX_INSTR psubd, mmx, 0, 0, 0 +AVX_INSTR psubq, sse2, 0, 0, 0 +AVX_INSTR psubsb, mmx, 0, 0, 0 +AVX_INSTR psubsw, mmx, 0, 0, 0 +AVX_INSTR psubusb, mmx, 0, 0, 0 +AVX_INSTR psubusw, mmx, 0, 0, 0 +AVX_INSTR ptest, sse4 +AVX_INSTR punpckhbw, mmx, 0, 0, 0 +AVX_INSTR punpckhwd, mmx, 0, 0, 0 +AVX_INSTR punpckhdq, mmx, 0, 0, 0 +AVX_INSTR punpckhqdq, sse2, 0, 0, 0 +AVX_INSTR punpcklbw, mmx, 0, 0, 0 +AVX_INSTR punpcklwd, mmx, 0, 0, 0 +AVX_INSTR punpckldq, mmx, 0, 0, 0 +AVX_INSTR punpcklqdq, sse2, 0, 0, 0 +AVX_INSTR pxor, mmx, 0, 0, 1 +AVX_INSTR rcpps, sse +AVX_INSTR rcpss, sse, 1, 0, 0 +AVX_INSTR roundpd, sse4 +AVX_INSTR roundps, sse4 +AVX_INSTR roundsd, sse4, 1, 1, 0 +AVX_INSTR roundss, sse4, 1, 1, 0 +AVX_INSTR rsqrtps, sse +AVX_INSTR rsqrtss, sse, 1, 0, 0 +AVX_INSTR shufpd, sse2, 1, 1, 0 +AVX_INSTR shufps, sse, 1, 1, 0 +AVX_INSTR sqrtpd, sse2 +AVX_INSTR sqrtps, sse +AVX_INSTR sqrtsd, sse2, 1, 0, 0 +AVX_INSTR sqrtss, sse, 1, 0, 0 +AVX_INSTR stmxcsr, sse +AVX_INSTR subpd, sse2, 1, 0, 0 +AVX_INSTR subps, sse, 1, 0, 0 +AVX_INSTR subsd, sse2, 1, 0, 0 +AVX_INSTR subss, sse, 1, 0, 0 +AVX_INSTR ucomisd, sse2 +AVX_INSTR ucomiss, sse +AVX_INSTR unpckhpd, sse2, 1, 0, 0 +AVX_INSTR unpckhps, sse, 1, 0, 0 +AVX_INSTR unpcklpd, sse2, 1, 0, 0 +AVX_INSTR unpcklps, sse, 1, 0, 0 +AVX_INSTR xorpd, sse2, 1, 0, 1 +AVX_INSTR xorps, sse, 1, 0, 1 + +; 3DNow instructions, for sharing code between AVX, SSE and 3DN +AVX_INSTR pfadd, 3dnow, 1, 0, 1 +AVX_INSTR pfsub, 3dnow, 1, 0, 0 +AVX_INSTR pfmul, 3dnow, 1, 0, 1 + +; base-4 constants for shuffles +%assign i 0 +%rep 256 + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) + %if j < 10 + CAT_XDEFINE q000, j, i + %elif j < 100 + CAT_XDEFINE q00, j, i + %elif j < 1000 + CAT_XDEFINE q0, j, i + %else + CAT_XDEFINE q, j, i + %endif + %assign i i+1 +%endrep +%undef i +%undef j + +%macro FMA_INSTR 3 + %macro %1 4-7 %1, %2, %3 + %if cpuflag(xop) + v%5 %1, %2, %3, %4 + %elifnidn %1, %4 + %6 %1, %2, %3 + %7 %1, %4 + %else + %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported + %endif + %endmacro +%endmacro + +FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation +FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation +FMA_INSTR pmadcswd, pmaddwd, paddd + +; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf. +; This lets us use tzcnt without bumping the yasm version requirement yet. +%define tzcnt rep bsf + +; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. +; FMA3 is only possible if dst is the same as one of the src registers. +; Either src2 or src3 can be a memory operand. +%macro FMA4_INSTR 2-* + %push fma4_instr + %xdefine %$prefix %1 + %rep %0 - 1 + %macro %$prefix%2 4-6 %$prefix, %2 + %if notcpuflag(fma3) && notcpuflag(fma4) + %error use of ``%5%6'' fma instruction in cpuname function: current_function + %elif cpuflag(fma4) + v%5%6 %1, %2, %3, %4 + %elifidn %1, %2 + ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. + %ifnum sizeof%3 + v%{5}213%6 %2, %3, %4 + %else + v%{5}132%6 %2, %4, %3 + %endif + %elifidn %1, %3 + v%{5}213%6 %3, %2, %4 + %elifidn %1, %4 + v%{5}231%6 %4, %2, %3 + %else + %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported + %endif + %endmacro + %rotate 1 + %endrep + %pop +%endmacro + +FMA4_INSTR fmadd, pd, ps, sd, ss +FMA4_INSTR fmaddsub, pd, ps +FMA4_INSTR fmsub, pd, ps, sd, ss +FMA4_INSTR fmsubadd, pd, ps +FMA4_INSTR fnmadd, pd, ps, sd, ss +FMA4_INSTR fnmsub, pd, ps, sd, ss + +; Macros for converting VEX instructions to equivalent EVEX ones. +%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex + %macro %1 2-7 fnord, fnord, %1, %2, %3 + %ifidn %3, fnord + %define %%args %1, %2 + %elifidn %4, fnord + %define %%args %1, %2, %3 + %else + %define %%args %1, %2, %3, %4 + %endif + %assign %%evex_required cpuflag(avx512) & %7 + %ifnum regnumof%1 + %if regnumof%1 >= 16 || sizeof%1 > 32 + %assign %%evex_required 1 + %endif + %endif + %ifnum regnumof%2 + %if regnumof%2 >= 16 || sizeof%2 > 32 + %assign %%evex_required 1 + %endif + %endif + %if %%evex_required + %6 %%args + %else + %5 %%args ; Prefer VEX over EVEX due to shorter instruction length + %endif + %endmacro +%endmacro + +EVEX_INSTR vbroadcastf128, vbroadcastf32x4 +EVEX_INSTR vbroadcasti128, vbroadcasti32x4 +EVEX_INSTR vextractf128, vextractf32x4 +EVEX_INSTR vextracti128, vextracti32x4 +EVEX_INSTR vinsertf128, vinsertf32x4 +EVEX_INSTR vinserti128, vinserti32x4 +EVEX_INSTR vmovdqa, vmovdqa32 +EVEX_INSTR vmovdqu, vmovdqu32 +EVEX_INSTR vpand, vpandd +EVEX_INSTR vpandn, vpandnd +EVEX_INSTR vpor, vpord +EVEX_INSTR vpxor, vpxord +EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision +EVEX_INSTR vrcpss, vrcp14ss, 1 +EVEX_INSTR vrsqrtps, vrsqrt14ps, 1 +EVEX_INSTR vrsqrtss, vrsqrt14ss, 1 + +; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0) +%ifdef __YASM_VER__ + %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 + %macro vpbroadcastq 2 + %if sizeof%1 == 16 + movddup %1, %2 + %else + vbroadcastsd %1, %2 + %endif + %endmacro + %endif +%endif diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/x86util.asm b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/x86util.asm new file mode 100644 index 000000000..d7cd99684 --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/x86util.asm @@ -0,0 +1,1028 @@ +;***************************************************************************** +;* x86util.asm +;***************************************************************************** +;* Copyright (C) 2008-2010 x264 project +;* +;* Authors: Loren Merritt +;* Holger Lubitz +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%define private_prefix ff +%define public_prefix avpriv +%define cpuflags_mmxext cpuflags_mmx2 + +%include "libavutil/x86/x86inc.asm" + +; expands to [base],...,[base+7*stride] +%define PASS8ROWS(base, base3, stride, stride3) \ + [base], [base + stride], [base + 2*stride], [base3], \ + [base3 + stride], [base3 + 2*stride], [base3 + stride3], [base3 + stride*4] + +; Interleave low src0 with low src1 and store in src0, +; interleave high src0 with high src1 and store in src1. +; %1 - types +; %2 - index of the register with src0 +; %3 - index of the register with src1 +; %4 - index of the register for intermediate results +; example for %1 - wd: input: src0: x0 x1 x2 x3 z0 z1 z2 z3 +; src1: y0 y1 y2 y3 q0 q1 q2 q3 +; output: src0: x0 y0 x1 y1 x2 y2 x3 y3 +; src1: z0 q0 z1 q1 z2 q2 z3 q3 +%macro SBUTTERFLY 4 +%ifidn %1, dqqq + vperm2i128 m%4, m%2, m%3, q0301 + vinserti128 m%2, m%2, xm%3, 1 +%elif avx_enabled == 0 + mova m%4, m%2 + punpckl%1 m%2, m%3 + punpckh%1 m%4, m%3 +%else + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 +%endif + SWAP %3, %4 +%endmacro + +%macro SBUTTERFLY2 4 + punpckl%1 m%4, m%2, m%3 + punpckh%1 m%2, m%2, m%3 + SWAP %2, %4, %3 +%endmacro + +%macro SBUTTERFLYPS 3 + unpcklps m%3, m%1, m%2 + unpckhps m%1, m%1, m%2 + SWAP %1, %3, %2 +%endmacro + +%macro SBUTTERFLYPD 3 + movlhps m%3, m%1, m%2 + movhlps m%2, m%2, m%1 + SWAP %1, %3 +%endmacro + +%macro TRANSPOSE4x4B 5 + SBUTTERFLY bw, %1, %2, %5 + SBUTTERFLY bw, %3, %4, %5 + SBUTTERFLY wd, %1, %3, %5 + SBUTTERFLY wd, %2, %4, %5 + SWAP %2, %3 +%endmacro + +%macro TRANSPOSE4x4W 5 + SBUTTERFLY wd, %1, %2, %5 + SBUTTERFLY wd, %3, %4, %5 + SBUTTERFLY dq, %1, %3, %5 + SBUTTERFLY dq, %2, %4, %5 + SWAP %2, %3 +%endmacro + +%macro TRANSPOSE2x4x4B 5 + SBUTTERFLY bw, %1, %2, %5 + SBUTTERFLY bw, %3, %4, %5 + SBUTTERFLY wd, %1, %3, %5 + SBUTTERFLY wd, %2, %4, %5 + SBUTTERFLY dq, %1, %2, %5 + SBUTTERFLY dq, %3, %4, %5 +%endmacro + +%macro TRANSPOSE2x4x4W 5 + SBUTTERFLY wd, %1, %2, %5 + SBUTTERFLY wd, %3, %4, %5 + SBUTTERFLY dq, %1, %3, %5 + SBUTTERFLY dq, %2, %4, %5 + SBUTTERFLY qdq, %1, %2, %5 + SBUTTERFLY qdq, %3, %4, %5 +%endmacro + +%macro TRANSPOSE4x4D 5 + SBUTTERFLY dq, %1, %2, %5 + SBUTTERFLY dq, %3, %4, %5 + SBUTTERFLY qdq, %1, %3, %5 + SBUTTERFLY qdq, %2, %4, %5 + SWAP %2, %3 +%endmacro + +; identical behavior to TRANSPOSE4x4D, but using SSE1 float ops +%macro TRANSPOSE4x4PS 5 + SBUTTERFLYPS %1, %2, %5 + SBUTTERFLYPS %3, %4, %5 + SBUTTERFLYPD %1, %3, %5 + SBUTTERFLYPD %2, %4, %5 + SWAP %2, %3 +%endmacro + +%macro TRANSPOSE8x4D 9-11 +%if ARCH_X86_64 + SBUTTERFLY dq, %1, %2, %9 + SBUTTERFLY dq, %3, %4, %9 + SBUTTERFLY dq, %5, %6, %9 + SBUTTERFLY dq, %7, %8, %9 + SBUTTERFLY qdq, %1, %3, %9 + SBUTTERFLY qdq, %2, %4, %9 + SBUTTERFLY qdq, %5, %7, %9 + SBUTTERFLY qdq, %6, %8, %9 + SWAP %2, %5 + SWAP %4, %7 +%else +; in: m0..m7 +; out: m0..m7, unless %11 in which case m2 is in %9 +; spills into %9 and %10 + movdqa %9, m%7 + SBUTTERFLY dq, %1, %2, %7 + movdqa %10, m%2 + movdqa m%7, %9 + SBUTTERFLY dq, %3, %4, %2 + SBUTTERFLY dq, %5, %6, %2 + SBUTTERFLY dq, %7, %8, %2 + SBUTTERFLY qdq, %1, %3, %2 + movdqa %9, m%3 + movdqa m%2, %10 + SBUTTERFLY qdq, %2, %4, %3 + SBUTTERFLY qdq, %5, %7, %3 + SBUTTERFLY qdq, %6, %8, %3 + SWAP %2, %5 + SWAP %4, %7 +%if %0<11 + movdqa m%3, %9 +%endif +%endif +%endmacro + +%macro TRANSPOSE8x8W 9-11 +%if ARCH_X86_64 + SBUTTERFLY wd, %1, %2, %9 + SBUTTERFLY wd, %3, %4, %9 + SBUTTERFLY wd, %5, %6, %9 + SBUTTERFLY wd, %7, %8, %9 + SBUTTERFLY dq, %1, %3, %9 + SBUTTERFLY dq, %2, %4, %9 + SBUTTERFLY dq, %5, %7, %9 + SBUTTERFLY dq, %6, %8, %9 + SBUTTERFLY qdq, %1, %5, %9 + SBUTTERFLY qdq, %2, %6, %9 + SBUTTERFLY qdq, %3, %7, %9 + SBUTTERFLY qdq, %4, %8, %9 + SWAP %2, %5 + SWAP %4, %7 +%else +; in: m0..m7, unless %11 in which case m6 is in %9 +; out: m0..m7, unless %11 in which case m4 is in %10 +; spills into %9 and %10 +%if %0<11 + movdqa %9, m%7 +%endif + SBUTTERFLY wd, %1, %2, %7 + movdqa %10, m%2 + movdqa m%7, %9 + SBUTTERFLY wd, %3, %4, %2 + SBUTTERFLY wd, %5, %6, %2 + SBUTTERFLY wd, %7, %8, %2 + SBUTTERFLY dq, %1, %3, %2 + movdqa %9, m%3 + movdqa m%2, %10 + SBUTTERFLY dq, %2, %4, %3 + SBUTTERFLY dq, %5, %7, %3 + SBUTTERFLY dq, %6, %8, %3 + SBUTTERFLY qdq, %1, %5, %3 + SBUTTERFLY qdq, %2, %6, %3 + movdqa %10, m%2 + movdqa m%3, %9 + SBUTTERFLY qdq, %3, %7, %2 + SBUTTERFLY qdq, %4, %8, %2 + SWAP %2, %5 + SWAP %4, %7 +%if %0<11 + movdqa m%5, %10 +%endif +%endif +%endmacro + +%macro TRANSPOSE16x16W 18-19 +; in: m0..m15, unless %19 in which case m6 is in %17 +; out: m0..m15, unless %19 in which case m4 is in %18 +; spills into %17 and %18 +%if %0 < 19 + mova %17, m%7 +%endif + + SBUTTERFLY dqqq, %1, %9, %7 + SBUTTERFLY dqqq, %2, %10, %7 + SBUTTERFLY dqqq, %3, %11, %7 + SBUTTERFLY dqqq, %4, %12, %7 + SBUTTERFLY dqqq, %5, %13, %7 + SBUTTERFLY dqqq, %6, %14, %7 + mova %18, m%14 + mova m%7, %17 + SBUTTERFLY dqqq, %7, %15, %14 + SBUTTERFLY dqqq, %8, %16, %14 + + SBUTTERFLY wd, %1, %2, %14 + SBUTTERFLY wd, %3, %4, %14 + SBUTTERFLY wd, %5, %6, %14 + SBUTTERFLY wd, %7, %8, %14 + SBUTTERFLY wd, %9, %10, %14 + SBUTTERFLY wd, %11, %12, %14 + mova %17, m%12 + mova m%14, %18 + SBUTTERFLY wd, %13, %14, %12 + SBUTTERFLY wd, %15, %16, %12 + + SBUTTERFLY dq, %1, %3, %12 + SBUTTERFLY dq, %2, %4, %12 + SBUTTERFLY dq, %5, %7, %12 + SBUTTERFLY dq, %6, %8, %12 + SBUTTERFLY dq, %9, %11, %12 + mova %18, m%11 + mova m%12, %17 + SBUTTERFLY dq, %10, %12, %11 + SBUTTERFLY dq, %13, %15, %11 + SBUTTERFLY dq, %14, %16, %11 + + SBUTTERFLY qdq, %1, %5, %11 + SBUTTERFLY qdq, %2, %6, %11 + SBUTTERFLY qdq, %3, %7, %11 + SBUTTERFLY qdq, %4, %8, %11 + + SWAP %2, %5 + SWAP %4, %7 + + SBUTTERFLY qdq, %9, %13, %11 + SBUTTERFLY qdq, %10, %14, %11 + mova m%11, %18 + mova %18, m%5 + SBUTTERFLY qdq, %11, %15, %5 + SBUTTERFLY qdq, %12, %16, %5 + +%if %0 < 19 + mova m%5, %18 +%endif + + SWAP %10, %13 + SWAP %12, %15 +%endmacro + +%macro TRANSPOSE_8X8B 8 + %if mmsize == 8 + %error "This macro does not support mmsize == 8" + %endif + punpcklbw m%1, m%2 + punpcklbw m%3, m%4 + punpcklbw m%5, m%6 + punpcklbw m%7, m%8 + TRANSPOSE4x4W %1, %3, %5, %7, %2 + MOVHL m%2, m%1 + MOVHL m%4, m%3 + MOVHL m%6, m%5 + MOVHL m%8, m%7 +%endmacro + +; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place +%macro PABSW 2 +%if cpuflag(ssse3) + pabsw %1, %2 +%elif cpuflag(mmxext) + pxor %1, %1 + psubw %1, %2 + pmaxsw %1, %2 +%else + pxor %1, %1 + pcmpgtw %1, %2 + pxor %2, %1 + psubw %2, %1 + SWAP %1, %2 +%endif +%endmacro + +%macro PSIGNW 2 +%if cpuflag(ssse3) + psignw %1, %2 +%else + pxor %1, %2 + psubw %1, %2 +%endif +%endmacro + +%macro ABS1 2 +%if cpuflag(ssse3) + pabsw %1, %1 +%elif cpuflag(mmxext) ; a, tmp + pxor %2, %2 + psubw %2, %1 + pmaxsw %1, %2 +%else ; a, tmp + pxor %2, %2 + pcmpgtw %2, %1 + pxor %1, %2 + psubw %1, %2 +%endif +%endmacro + +%macro ABS2 4 +%if cpuflag(ssse3) + pabsw %1, %1 + pabsw %2, %2 +%elif cpuflag(mmxext) ; a, b, tmp0, tmp1 + pxor %3, %3 + pxor %4, %4 + psubw %3, %1 + psubw %4, %2 + pmaxsw %1, %3 + pmaxsw %2, %4 +%else ; a, b, tmp0, tmp1 + pxor %3, %3 + pxor %4, %4 + pcmpgtw %3, %1 + pcmpgtw %4, %2 + pxor %1, %3 + pxor %2, %4 + psubw %1, %3 + psubw %2, %4 +%endif +%endmacro + +%macro ABSB 2 ; source mmreg, temp mmreg (unused for SSSE3) +%if cpuflag(ssse3) + pabsb %1, %1 +%else + pxor %2, %2 + psubb %2, %1 + pminub %1, %2 +%endif +%endmacro + +%macro ABSB2 4 ; src1, src2, tmp1, tmp2 (tmp1/2 unused for SSSE3) +%if cpuflag(ssse3) + pabsb %1, %1 + pabsb %2, %2 +%else + pxor %3, %3 + pxor %4, %4 + psubb %3, %1 + psubb %4, %2 + pminub %1, %3 + pminub %2, %4 +%endif +%endmacro + +%macro ABSD2 4 + pxor %3, %3 + pxor %4, %4 + pcmpgtd %3, %1 + pcmpgtd %4, %2 + pxor %1, %3 + pxor %2, %4 + psubd %1, %3 + psubd %2, %4 +%endmacro + +%macro ABS4 6 + ABS2 %1, %2, %5, %6 + ABS2 %3, %4, %5, %6 +%endmacro + +%macro SPLATB_LOAD 3 +%if cpuflag(ssse3) + movd %1, [%2-3] + pshufb %1, %3 +%else + movd %1, [%2-3] ;to avoid crossing a cacheline + punpcklbw %1, %1 + SPLATW %1, %1, 3 +%endif +%endmacro + +%macro SPLATB_REG 3 +%if cpuflag(ssse3) + movd %1, %2d + pshufb %1, %3 +%else + movd %1, %2d + punpcklbw %1, %1 + SPLATW %1, %1, 0 +%endif +%endmacro + +%macro HADDD 2 ; sum junk +%if sizeof%1 == 32 +%define %2 xmm%2 + vextracti128 %2, %1, 1 +%define %1 xmm%1 + paddd %1, %2 +%endif +%if mmsize >= 16 +%if cpuflag(xop) && sizeof%1 == 16 + vphadddq %1, %1 +%endif + movhlps %2, %1 + paddd %1, %2 +%endif +%if notcpuflag(xop) || sizeof%1 != 16 +%if cpuflag(mmxext) + PSHUFLW %2, %1, q0032 +%else ; mmx + mova %2, %1 + psrlq %2, 32 +%endif + paddd %1, %2 +%endif +%undef %1 +%undef %2 +%endmacro + +%macro HADDW 2 ; reg, tmp +%if cpuflag(xop) && sizeof%1 == 16 + vphaddwq %1, %1 + movhlps %2, %1 + paddd %1, %2 +%else + pmaddwd %1, [pw_1] + HADDD %1, %2 +%endif +%endmacro + +%macro HADDPS 3 ; dst, src, tmp +%if cpuflag(sse3) + haddps %1, %1, %2 +%else + movaps %3, %1 + shufps %1, %2, q2020 + shufps %3, %2, q3131 + addps %1, %3 +%endif +%endmacro + +%macro PALIGNR 4-5 +%if cpuflag(ssse3) +%if %0==5 + palignr %1, %2, %3, %4 +%else + palignr %1, %2, %3 +%endif +%else ; [dst,] src1, src2, imm, tmp + %define %%dst %1 +%if %0==5 +%ifnidn %1, %2 + mova %%dst, %2 +%endif + %rotate 1 +%endif +%ifnidn %4, %2 + mova %4, %2 +%endif +%if mmsize==8 + psllq %%dst, (8-%3)*8 + psrlq %4, %3*8 +%else + pslldq %%dst, 16-%3 + psrldq %4, %3 +%endif + por %%dst, %4 +%endif +%endmacro + +%macro PAVGB 2-4 +%if cpuflag(mmxext) + pavgb %1, %2 +%elif cpuflag(3dnow) + pavgusb %1, %2 +%elif cpuflag(mmx) + movu %3, %2 + por %3, %1 + pxor %1, %2 + pand %1, %4 + psrlq %1, 1 + psubb %3, %1 + SWAP %1, %3 +%endif +%endmacro + +%macro PSHUFLW 1+ + %if mmsize == 8 + pshufw %1 + %else + pshuflw %1 + %endif +%endmacro + +%macro PSWAPD 2 +%if cpuflag(mmxext) + pshufw %1, %2, q1032 +%elif cpuflag(3dnowext) + pswapd %1, %2 +%elif cpuflag(3dnow) + movq %1, %2 + psrlq %1, 32 + punpckldq %1, %2 +%endif +%endmacro + +%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from +%ifnum %5 + pand m%3, m%5, m%4 ; src .. y6 .. y4 + pand m%1, m%5, m%2 ; dst .. y6 .. y4 +%else + mova m%1, %5 + pand m%3, m%1, m%4 ; src .. y6 .. y4 + pand m%1, m%1, m%2 ; dst .. y6 .. y4 +%endif + psrlw m%2, 8 ; dst .. y7 .. y5 + psrlw m%4, 8 ; src .. y7 .. y5 +%endmacro + +%macro SUMSUB_BA 3-4 +%if %0==3 + padd%1 m%2, m%3 + padd%1 m%3, m%3 + psub%1 m%3, m%2 +%else +%if avx_enabled == 0 + mova m%4, m%2 + padd%1 m%2, m%3 + psub%1 m%3, m%4 +%else + padd%1 m%4, m%2, m%3 + psub%1 m%3, m%2 + SWAP %2, %4 +%endif +%endif +%endmacro + +%macro SUMSUB_BADC 5-6 +%if %0==6 + SUMSUB_BA %1, %2, %3, %6 + SUMSUB_BA %1, %4, %5, %6 +%else + padd%1 m%2, m%3 + padd%1 m%4, m%5 + padd%1 m%3, m%3 + padd%1 m%5, m%5 + psub%1 m%3, m%2 + psub%1 m%5, m%4 +%endif +%endmacro + +%macro SUMSUB2_AB 4 +%ifnum %3 + psub%1 m%4, m%2, m%3 + psub%1 m%4, m%3 + padd%1 m%2, m%2 + padd%1 m%2, m%3 +%else + mova m%4, m%2 + padd%1 m%2, m%2 + padd%1 m%2, %3 + psub%1 m%4, %3 + psub%1 m%4, %3 +%endif +%endmacro + +%macro SUMSUB2_BA 4 +%if avx_enabled == 0 + mova m%4, m%2 + padd%1 m%2, m%3 + padd%1 m%2, m%3 + psub%1 m%3, m%4 + psub%1 m%3, m%4 +%else + padd%1 m%4, m%2, m%3 + padd%1 m%4, m%3 + psub%1 m%3, m%2 + psub%1 m%3, m%2 + SWAP %2, %4 +%endif +%endmacro + +%macro SUMSUBD2_AB 5 +%ifnum %4 + psra%1 m%5, m%2, 1 ; %3: %3>>1 + psra%1 m%4, m%3, 1 ; %2: %2>>1 + padd%1 m%4, m%2 ; %3: %3>>1+%2 + psub%1 m%5, m%3 ; %2: %2>>1-%3 + SWAP %2, %5 + SWAP %3, %4 +%else + mova %5, m%2 + mova %4, m%3 + psra%1 m%3, 1 ; %3: %3>>1 + psra%1 m%2, 1 ; %2: %2>>1 + padd%1 m%3, %5 ; %3: %3>>1+%2 + psub%1 m%2, %4 ; %2: %2>>1-%3 +%endif +%endmacro + +%macro DCT4_1D 5 +%ifnum %5 + SUMSUB_BADC w, %4, %1, %3, %2, %5 + SUMSUB_BA w, %3, %4, %5 + SUMSUB2_AB w, %1, %2, %5 + SWAP %1, %3, %4, %5, %2 +%else + SUMSUB_BADC w, %4, %1, %3, %2 + SUMSUB_BA w, %3, %4 + mova [%5], m%2 + SUMSUB2_AB w, %1, [%5], %2 + SWAP %1, %3, %4, %2 +%endif +%endmacro + +%macro IDCT4_1D 6-7 +%ifnum %6 + SUMSUBD2_AB %1, %3, %5, %7, %6 + ; %3: %3>>1-%5 %5: %3+%5>>1 + SUMSUB_BA %1, %4, %2, %7 + ; %4: %2+%4 %2: %2-%4 + SUMSUB_BADC %1, %5, %4, %3, %2, %7 + ; %5: %2+%4 + (%3+%5>>1) + ; %4: %2+%4 - (%3+%5>>1) + ; %3: %2-%4 + (%3>>1-%5) + ; %2: %2-%4 - (%3>>1-%5) +%else +%ifidn %1, w + SUMSUBD2_AB %1, %3, %5, [%6], [%6+16] +%else + SUMSUBD2_AB %1, %3, %5, [%6], [%6+32] +%endif + SUMSUB_BA %1, %4, %2 + SUMSUB_BADC %1, %5, %4, %3, %2 +%endif + SWAP %2, %5, %4 + ; %2: %2+%4 + (%3+%5>>1) row0 + ; %3: %2-%4 + (%3>>1-%5) row1 + ; %4: %2-%4 - (%3>>1-%5) row2 + ; %5: %2+%4 - (%3+%5>>1) row3 +%endmacro + + +%macro LOAD_DIFF 5 +%ifidn %3, none + movh %1, %4 + movh %2, %5 + punpcklbw %1, %2 + punpcklbw %2, %2 + psubw %1, %2 +%else + movh %1, %4 + punpcklbw %1, %3 + movh %2, %5 + punpcklbw %2, %3 + psubw %1, %2 +%endif +%endmacro + +%macro STORE_DCT 6 + movq [%5+%6+ 0], m%1 + movq [%5+%6+ 8], m%2 + movq [%5+%6+16], m%3 + movq [%5+%6+24], m%4 + movhps [%5+%6+32], m%1 + movhps [%5+%6+40], m%2 + movhps [%5+%6+48], m%3 + movhps [%5+%6+56], m%4 +%endmacro + +%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment? + LOAD_DIFF m%1, m%5, m%7, [%8], [%9] + LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3] + LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3] + LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5] +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + +%macro DIFFx2 6-7 + movh %3, %5 + punpcklbw %3, %4 + psraw %1, 6 + paddsw %1, %3 + movh %3, %6 + punpcklbw %3, %4 + psraw %2, 6 + paddsw %2, %3 + packuswb %2, %1 +%endmacro + +%macro STORE_DIFF 4 + movh %2, %4 + punpcklbw %2, %3 + psraw %1, 6 + paddsw %1, %2 + packuswb %1, %1 + movh %4, %1 +%endmacro + +%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride + movh %3, [%7] + movh %4, [%7+%8] + psraw %1, %6 + psraw %2, %6 + punpcklbw %3, %5 + punpcklbw %4, %5 + paddw %3, %1 + paddw %4, %2 + packuswb %3, %5 + packuswb %4, %5 + movh [%7], %3 + movh [%7+%8], %4 +%endmacro + +%macro PMINUB 3 ; dst, src, ignored +%if cpuflag(mmxext) + pminub %1, %2 +%else ; dst, src, tmp + mova %3, %1 + psubusb %3, %2 + psubb %1, %3 +%endif +%endmacro + +%macro SPLATW 2-3 0 +%if cpuflag(avx2) && %3 == 0 + vpbroadcastw %1, %2 +%elif mmsize == 16 + pshuflw %1, %2, (%3)*0x55 + punpcklqdq %1, %1 +%elif cpuflag(mmxext) + pshufw %1, %2, (%3)*0x55 +%else + %ifnidn %1, %2 + mova %1, %2 + %endif + %if %3 & 2 + punpckhwd %1, %1 + %else + punpcklwd %1, %1 + %endif + %if %3 & 1 + punpckhwd %1, %1 + %else + punpcklwd %1, %1 + %endif +%endif +%endmacro + +%macro SPLATD 1 +%if mmsize == 8 + punpckldq %1, %1 +%elif cpuflag(sse2) + pshufd %1, %1, 0 +%elif cpuflag(sse) + shufps %1, %1, 0 +%endif +%endmacro + +%macro CLIPUB 3 ;(dst, min, max) + pmaxub %1, %2 + pminub %1, %3 +%endmacro + +%macro CLIPW 3 ;(dst, min, max) + pmaxsw %1, %2 + pminsw %1, %3 +%endmacro + +%macro PMINSD 3 ; dst, src, tmp/unused +%if cpuflag(sse4) + pminsd %1, %2 +%elif cpuflag(sse2) + cvtdq2ps %1, %1 + minps %1, %2 + cvtps2dq %1, %1 +%else + mova %3, %2 + pcmpgtd %3, %1 + pxor %1, %2 + pand %1, %3 + pxor %1, %2 +%endif +%endmacro + +%macro PMAXSD 3 ; dst, src, tmp/unused +%if cpuflag(sse4) + pmaxsd %1, %2 +%else + mova %3, %1 + pcmpgtd %3, %2 + pand %1, %3 + pandn %3, %2 + por %1, %3 +%endif +%endmacro + +%macro CLIPD 3-4 +%if cpuflag(sse4); src/dst, min, max, unused + pminsd %1, %3 + pmaxsd %1, %2 +%elif cpuflag(sse2) ; src/dst, min (float), max (float), unused + cvtdq2ps %1, %1 + minps %1, %3 + maxps %1, %2 + cvtps2dq %1, %1 +%else ; src/dst, min, max, tmp + PMINSD %1, %3, %4 + PMAXSD %1, %2, %4 +%endif +%endmacro + +%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm +%if cpuflag(avx2) + vbroadcastss %1, %2 +%elif cpuflag(avx) + %ifnum sizeof%2 ; avx1 register + shufps xmm%1, xmm%2, xmm%2, q0000 + %if sizeof%1 >= 32 ; mmsize>=32 + vinsertf128 %1, %1, xmm%1, 1 + %endif + %else ; avx1 memory + vbroadcastss %1, %2 + %endif +%else + %ifnum sizeof%2 ; sse register + shufps %1, %2, %2, q0000 + %else ; sse memory + movss %1, %2 + shufps %1, %1, 0 + %endif +%endif +%endmacro + +%macro VBROADCASTSD 2 ; dst xmm/ymm, src m64 +%if cpuflag(avx) && mmsize == 32 + vbroadcastsd %1, %2 +%elif cpuflag(sse3) + movddup %1, %2 +%else ; sse2 + movsd %1, %2 + movlhps %1, %1 +%endif +%endmacro + +%macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm +%if cpuflag(avx2) + vpbroadcastd %1, %2 +%elif cpuflag(avx) && sizeof%1 >= 32 + %error vpbroadcastd not possible with ymm on avx1. try vbroadcastss +%else + %ifnum sizeof%2 ; sse2 register + pshufd %1, %2, q0000 + %else ; sse memory + movd %1, %2 + pshufd %1, %1, 0 + %endif +%endif +%endmacro + +%macro VBROADCASTI128 2 ; dst xmm/ymm, src : 128bits val +%if mmsize > 16 + vbroadcasti128 %1, %2 +%else + mova %1, %2 +%endif +%endmacro + +%macro SHUFFLE_MASK_W 8 + %rep 8 + %if %1>=0x80 + db %1, %1 + %else + db %1*2 + db %1*2+1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro PMOVSXWD 2; dst, src +%if cpuflag(sse4) + pmovsxwd %1, %2 +%else + %ifnidn %1, %2 + mova %1, %2 + %endif + punpcklwd %1, %1 + psrad %1, 16 +%endif +%endmacro + +; Wrapper for non-FMA version of fmaddps +%macro FMULADD_PS 5 + %if cpuflag(fma3) || cpuflag(fma4) + fmaddps %1, %2, %3, %4 + %elifidn %1, %4 + mulps %5, %2, %3 + addps %1, %4, %5 + %else + mulps %1, %2, %3 + addps %1, %4 + %endif +%endmacro + +%macro LSHIFT 2 +%if mmsize > 8 + pslldq %1, %2 +%else + psllq %1, 8*(%2) +%endif +%endmacro + +%macro RSHIFT 2 +%if mmsize > 8 + psrldq %1, %2 +%else + psrlq %1, 8*(%2) +%endif +%endmacro + +%macro MOVHL 2 ; dst, src +%ifidn %1, %2 + punpckhqdq %1, %2 +%elif cpuflag(avx) + punpckhqdq %1, %2, %2 +%elif cpuflag(sse4) + pshufd %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones +%else + movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst +%endif +%endmacro + +; Horizontal Sum of Packed Single precision floats +; The resulting sum is in all elements. +%macro HSUMPS 2 ; dst/src, tmp +%if cpuflag(avx) + %if sizeof%1>=32 ; avx + vperm2f128 %2, %1, %1, (0)*16+(1) + addps %1, %2 + %endif + shufps %2, %1, %1, q1032 + addps %1, %2 + shufps %2, %1, %1, q0321 + addps %1, %2 +%else ; this form is a bit faster than the short avx-like emulation. + movaps %2, %1 + shufps %1, %1, q1032 + addps %1, %2 + movaps %2, %1 + shufps %1, %1, q0321 + addps %1, %2 + ; all %1 members should be equal for as long as float a+b==b+a +%endif +%endmacro + +; Emulate blendvps if not available +; +; src_b is destroyed when using emulation with logical operands +; SSE41 blendv instruction is hard coded to use xmm0 as mask +%macro BLENDVPS 3 ; dst/src_a, src_b, mask +%if cpuflag(avx) + blendvps %1, %1, %2, %3 +%elif cpuflag(sse4) + %ifnidn %3,xmm0 + %error sse41 blendvps uses xmm0 as default 3d operand, you used %3 + %endif + blendvps %1, %2, %3 +%else + xorps %2, %1 + andps %2, %3 + xorps %1, %2 +%endif +%endmacro + +; Emulate pblendvb if not available +; +; src_b is destroyed when using emulation with logical operands +; SSE41 blendv instruction is hard coded to use xmm0 as mask +%macro PBLENDVB 3 ; dst/src_a, src_b, mask +%if cpuflag(avx) + %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32 + %error pblendb not possible with ymm on avx1, try blendvps. + %endif + pblendvb %1, %1, %2, %3 +%elif cpuflag(sse4) + %ifnidn %3,xmm0 + %error sse41 pblendvd uses xmm0 as default 3d operand, you used %3 + %endif + pblendvb %1, %2, %3 +%else + pxor %2, %1 + pand %2, %3 + pxor %1, %2 +%endif +%endmacro diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/aarch64/Makefile b/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/aarch64/Makefile new file mode 100644 index 000000000..5c34f8d94 --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/aarch64/Makefile @@ -0,0 +1,7 @@ +OBJS += aarch64/audio_convert_init.o \ + aarch64/resample_init.o + +OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o + +NEON-OBJS += aarch64/audio_convert_neon.o \ + aarch64/resample.o diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/arm/Makefile b/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/arm/Makefile new file mode 100644 index 000000000..53ab4626f --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/arm/Makefile @@ -0,0 +1,8 @@ +OBJS += arm/audio_convert_init.o \ + arm/resample_init.o + + +OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o + +NEON-OBJS += arm/audio_convert_neon.o \ + arm/resample.o diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/x86/Makefile b/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/x86/Makefile new file mode 100644 index 000000000..fa0641f03 --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/x86/Makefile @@ -0,0 +1,9 @@ +X86ASM-OBJS += x86/audio_convert.o\ + x86/rematrix.o\ + x86/resample.o\ + +OBJS += x86/audio_convert_init.o\ + x86/rematrix_init.o\ + x86/resample_init.o\ + +OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/x86/audio_convert.asm b/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/x86/audio_convert.asm new file mode 100644 index 000000000..d441636d3 --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/x86/audio_convert.asm @@ -0,0 +1,739 @@ +;****************************************************************************** +;* Copyright (c) 2012 Michael Niedermayer +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 +flt2pm31: times 8 dd 4.6566129e-10 +flt2p31 : times 8 dd 2147483648.0 +flt2p15 : times 8 dd 32768.0 + +word_unpack_shuf : db 0, 1, 4, 5, 8, 9,12,13, 2, 3, 6, 7,10,11,14,15 + +SECTION .text + + +;to, from, a/u, log2_outsize, log_intsize, const +%macro PACK_2CH 5-7 +cglobal pack_2ch_%2_to_%1_%3, 3, 4, 6, dst, src, len, src2 + mov src2q , [srcq+gprsize] + mov srcq , [srcq] + mov dstq , [dstq] +%ifidn %3, a + test dstq, mmsize-1 + jne pack_2ch_%2_to_%1_u_int %+ SUFFIX + test srcq, mmsize-1 + jne pack_2ch_%2_to_%1_u_int %+ SUFFIX + test src2q, mmsize-1 + jne pack_2ch_%2_to_%1_u_int %+ SUFFIX +%else +pack_2ch_%2_to_%1_u_int %+ SUFFIX: +%endif + lea srcq , [srcq + (1<<%5)*lenq] + lea src2q, [src2q + (1<<%5)*lenq] + lea dstq , [dstq + (2<<%4)*lenq] + neg lenq + %7 m0,m1,m2,m3,m4,m5 +.next: +%if %4 >= %5 + mov%3 m0, [ srcq +(1<<%5)*lenq] + mova m1, m0 + mov%3 m2, [ src2q+(1<<%5)*lenq] +%if %5 == 1 + punpcklwd m0, m2 + punpckhwd m1, m2 +%else + punpckldq m0, m2 + punpckhdq m1, m2 +%endif + %6 m0,m1,m2,m3,m4,m5 +%else + mov%3 m0, [ srcq +(1<<%5)*lenq] + mov%3 m1, [mmsize + srcq +(1<<%5)*lenq] + mov%3 m2, [ src2q+(1<<%5)*lenq] + mov%3 m3, [mmsize + src2q+(1<<%5)*lenq] + %6 m0,m1,m2,m3,m4,m5 + mova m2, m0 + punpcklwd m0, m1 + punpckhwd m2, m1 + SWAP 1,2 +%endif + mov%3 [ dstq+(2<<%4)*lenq], m0 + mov%3 [ mmsize + dstq+(2<<%4)*lenq], m1 +%if %4 > %5 + mov%3 [2*mmsize + dstq+(2<<%4)*lenq], m2 + mov%3 [3*mmsize + dstq+(2<<%4)*lenq], m3 + add lenq, 4*mmsize/(2<<%4) +%else + add lenq, 2*mmsize/(2<<%4) +%endif + jl .next + REP_RET +%endmacro + +%macro UNPACK_2CH 5-7 +cglobal unpack_2ch_%2_to_%1_%3, 3, 4, 7, dst, src, len, dst2 + mov dst2q , [dstq+gprsize] + mov srcq , [srcq] + mov dstq , [dstq] +%ifidn %3, a + test dstq, mmsize-1 + jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX + test srcq, mmsize-1 + jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX + test dst2q, mmsize-1 + jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX +%else +unpack_2ch_%2_to_%1_u_int %+ SUFFIX: +%endif + lea srcq , [srcq + (2<<%5)*lenq] + lea dstq , [dstq + (1<<%4)*lenq] + lea dst2q, [dst2q + (1<<%4)*lenq] + neg lenq + %7 m0,m1,m2,m3,m4,m5 + mova m6, [word_unpack_shuf] +.next: + mov%3 m0, [ srcq +(2<<%5)*lenq] + mov%3 m2, [ mmsize + srcq +(2<<%5)*lenq] +%if %5 == 1 +%ifidn SUFFIX, _ssse3 + pshufb m0, m6 + mova m1, m0 + pshufb m2, m6 + punpcklqdq m0,m2 + punpckhqdq m1,m2 +%else + mova m1, m0 + punpcklwd m0,m2 + punpckhwd m1,m2 + + mova m2, m0 + punpcklwd m0,m1 + punpckhwd m2,m1 + + mova m1, m0 + punpcklwd m0,m2 + punpckhwd m1,m2 +%endif +%else + mova m1, m0 + shufps m0, m2, 10001000b + shufps m1, m2, 11011101b +%endif +%if %4 < %5 + mov%3 m2, [2*mmsize + srcq +(2<<%5)*lenq] + mova m3, m2 + mov%3 m4, [3*mmsize + srcq +(2<<%5)*lenq] + shufps m2, m4, 10001000b + shufps m3, m4, 11011101b + SWAP 1,2 +%endif + %6 m0,m1,m2,m3,m4,m5 + mov%3 [ dstq+(1<<%4)*lenq], m0 +%if %4 > %5 + mov%3 [ dst2q+(1<<%4)*lenq], m2 + mov%3 [ mmsize + dstq+(1<<%4)*lenq], m1 + mov%3 [ mmsize + dst2q+(1<<%4)*lenq], m3 + add lenq, 2*mmsize/(1<<%4) +%else + mov%3 [ dst2q+(1<<%4)*lenq], m1 + add lenq, mmsize/(1<<%4) +%endif + jl .next + REP_RET +%endmacro + +%macro CONV 5-7 +cglobal %2_to_%1_%3, 3, 3, 6, dst, src, len + mov srcq , [srcq] + mov dstq , [dstq] +%ifidn %3, a + test dstq, mmsize-1 + jne %2_to_%1_u_int %+ SUFFIX + test srcq, mmsize-1 + jne %2_to_%1_u_int %+ SUFFIX +%else +%2_to_%1_u_int %+ SUFFIX: +%endif + lea srcq , [srcq + (1<<%5)*lenq] + lea dstq , [dstq + (1<<%4)*lenq] + neg lenq + %7 m0,m1,m2,m3,m4,m5 +.next: + mov%3 m0, [ srcq +(1<<%5)*lenq] + mov%3 m1, [ mmsize + srcq +(1<<%5)*lenq] +%if %4 < %5 + mov%3 m2, [2*mmsize + srcq +(1<<%5)*lenq] + mov%3 m3, [3*mmsize + srcq +(1<<%5)*lenq] +%endif + %6 m0,m1,m2,m3,m4,m5 + mov%3 [ dstq+(1<<%4)*lenq], m0 + mov%3 [ mmsize + dstq+(1<<%4)*lenq], m1 +%if %4 > %5 + mov%3 [2*mmsize + dstq+(1<<%4)*lenq], m2 + mov%3 [3*mmsize + dstq+(1<<%4)*lenq], m3 + add lenq, 4*mmsize/(1<<%4) +%else + add lenq, 2*mmsize/(1<<%4) +%endif + jl .next +%if mmsize == 8 + emms + RET +%else + REP_RET +%endif +%endmacro + +%macro PACK_6CH 8 +cglobal pack_6ch_%2_to_%1_%3, 2, 8, %6, dst, src, src1, src2, src3, src4, src5, len +%if ARCH_X86_64 + mov lend, r2d +%else + %define lend dword r2m +%endif + mov src1q, [srcq+1*gprsize] + mov src2q, [srcq+2*gprsize] + mov src3q, [srcq+3*gprsize] + mov src4q, [srcq+4*gprsize] + mov src5q, [srcq+5*gprsize] + mov srcq, [srcq] + mov dstq, [dstq] +%ifidn %3, a + test dstq, mmsize-1 + jne pack_6ch_%2_to_%1_u_int %+ SUFFIX + test srcq, mmsize-1 + jne pack_6ch_%2_to_%1_u_int %+ SUFFIX + test src1q, mmsize-1 + jne pack_6ch_%2_to_%1_u_int %+ SUFFIX + test src2q, mmsize-1 + jne pack_6ch_%2_to_%1_u_int %+ SUFFIX + test src3q, mmsize-1 + jne pack_6ch_%2_to_%1_u_int %+ SUFFIX + test src4q, mmsize-1 + jne pack_6ch_%2_to_%1_u_int %+ SUFFIX + test src5q, mmsize-1 + jne pack_6ch_%2_to_%1_u_int %+ SUFFIX +%else +pack_6ch_%2_to_%1_u_int %+ SUFFIX: +%endif + sub src1q, srcq + sub src2q, srcq + sub src3q, srcq + sub src4q, srcq + sub src5q, srcq + %8 x,x,x,x,m7,x +.loop: + mov%3 m0, [srcq ] + mov%3 m1, [srcq+src1q] + mov%3 m2, [srcq+src2q] + mov%3 m3, [srcq+src3q] + mov%3 m4, [srcq+src4q] + mov%3 m5, [srcq+src5q] +%if cpuflag(sse) + SBUTTERFLYPS 0, 1, 6 + SBUTTERFLYPS 2, 3, 6 + SBUTTERFLYPS 4, 5, 6 + +%if cpuflag(avx) + blendps m6, m4, m0, 1100b +%else + movaps m6, m4 + shufps m4, m0, q3210 + SWAP 4,6 +%endif + movlhps m0, m2 + movhlps m4, m2 +%if cpuflag(avx) + blendps m2, m5, m1, 1100b +%else + movaps m2, m5 + shufps m5, m1, q3210 + SWAP 2,5 +%endif + movlhps m1, m3 + movhlps m5, m3 + + %7 m0,m6,x,x,m7,m3 + %7 m4,m1,x,x,m7,m3 + %7 m2,m5,x,x,m7,m3 + + mov %+ %3 %+ ps [dstq ], m0 + mov %+ %3 %+ ps [dstq+16], m6 + mov %+ %3 %+ ps [dstq+32], m4 + mov %+ %3 %+ ps [dstq+48], m1 + mov %+ %3 %+ ps [dstq+64], m2 + mov %+ %3 %+ ps [dstq+80], m5 +%else ; mmx + SBUTTERFLY dq, 0, 1, 6 + SBUTTERFLY dq, 2, 3, 6 + SBUTTERFLY dq, 4, 5, 6 + + movq [dstq ], m0 + movq [dstq+ 8], m2 + movq [dstq+16], m4 + movq [dstq+24], m1 + movq [dstq+32], m3 + movq [dstq+40], m5 +%endif + add srcq, mmsize + add dstq, mmsize*6 + sub lend, mmsize/4 + jg .loop +%if mmsize == 8 + emms + RET +%else + REP_RET +%endif +%endmacro + +%macro UNPACK_6CH 8 +cglobal unpack_6ch_%2_to_%1_%3, 2, 8, %6, dst, src, dst1, dst2, dst3, dst4, dst5, len +%if ARCH_X86_64 + mov lend, r2d +%else + %define lend dword r2m +%endif + mov dst1q, [dstq+1*gprsize] + mov dst2q, [dstq+2*gprsize] + mov dst3q, [dstq+3*gprsize] + mov dst4q, [dstq+4*gprsize] + mov dst5q, [dstq+5*gprsize] + mov dstq, [dstq] + mov srcq, [srcq] +%ifidn %3, a + test dstq, mmsize-1 + jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX + test srcq, mmsize-1 + jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX + test dst1q, mmsize-1 + jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX + test dst2q, mmsize-1 + jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX + test dst3q, mmsize-1 + jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX + test dst4q, mmsize-1 + jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX + test dst5q, mmsize-1 + jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX +%else +unpack_6ch_%2_to_%1_u_int %+ SUFFIX: +%endif + sub dst1q, dstq + sub dst2q, dstq + sub dst3q, dstq + sub dst4q, dstq + sub dst5q, dstq + %8 x,x,x,x,m7,x +.loop: + mov%3 m0, [srcq ] + mov%3 m1, [srcq+16] + mov%3 m2, [srcq+32] + mov%3 m3, [srcq+48] + mov%3 m4, [srcq+64] + mov%3 m5, [srcq+80] + + SBUTTERFLYPS 0, 3, 6 + SBUTTERFLYPS 1, 4, 6 + SBUTTERFLYPS 2, 5, 6 + SBUTTERFLYPS 0, 4, 6 + SBUTTERFLYPS 3, 2, 6 + SBUTTERFLYPS 1, 5, 6 + SWAP 1, 4 + SWAP 2, 3 + + %7 m0,m1,x,x,m7,m6 + %7 m2,m3,x,x,m7,m6 + %7 m4,m5,x,x,m7,m6 + + mov %+ %3 %+ ps [dstq ], m0 + mov %+ %3 %+ ps [dstq+dst1q], m1 + mov %+ %3 %+ ps [dstq+dst2q], m2 + mov %+ %3 %+ ps [dstq+dst3q], m3 + mov %+ %3 %+ ps [dstq+dst4q], m4 + mov %+ %3 %+ ps [dstq+dst5q], m5 + + add srcq, mmsize*6 + add dstq, mmsize + sub lend, mmsize/4 + jg .loop + REP_RET +%endmacro + +%define PACK_8CH_GPRS (10 * ARCH_X86_64) + ((6 + HAVE_ALIGNED_STACK) * ARCH_X86_32) + +%macro PACK_8CH 8 +cglobal pack_8ch_%2_to_%1_%3, 2, PACK_8CH_GPRS, %6, ARCH_X86_32*48, dst, src, len, src1, src2, src3, src4, src5, src6, src7 + mov dstq, [dstq] +%if ARCH_X86_32 + DEFINE_ARGS dst, src, src2, src3, src4, src5, src6 + %define lend dword r2m + %define src1q r0q + %define src1m dword [rsp+32] +%if HAVE_ALIGNED_STACK == 0 + DEFINE_ARGS dst, src, src2, src3, src5, src6 + %define src4q r0q + %define src4m dword [rsp+36] +%endif + %define src7q r0q + %define src7m dword [rsp+40] + mov dstm, dstq +%endif + mov src7q, [srcq+7*gprsize] + mov src6q, [srcq+6*gprsize] +%if ARCH_X86_32 + mov src7m, src7q +%endif + mov src5q, [srcq+5*gprsize] + mov src4q, [srcq+4*gprsize] + mov src3q, [srcq+3*gprsize] +%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0 + mov src4m, src4q +%endif + mov src2q, [srcq+2*gprsize] + mov src1q, [srcq+1*gprsize] + mov srcq, [srcq] +%ifidn %3, a +%if ARCH_X86_32 + test dstmp, mmsize-1 +%else + test dstq, mmsize-1 +%endif + jne pack_8ch_%2_to_%1_u_int %+ SUFFIX + test srcq, mmsize-1 + jne pack_8ch_%2_to_%1_u_int %+ SUFFIX + test src1q, mmsize-1 + jne pack_8ch_%2_to_%1_u_int %+ SUFFIX + test src2q, mmsize-1 + jne pack_8ch_%2_to_%1_u_int %+ SUFFIX + test src3q, mmsize-1 + jne pack_8ch_%2_to_%1_u_int %+ SUFFIX +%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0 + test src4m, mmsize-1 +%else + test src4q, mmsize-1 +%endif + jne pack_8ch_%2_to_%1_u_int %+ SUFFIX + test src5q, mmsize-1 + jne pack_8ch_%2_to_%1_u_int %+ SUFFIX + test src6q, mmsize-1 + jne pack_8ch_%2_to_%1_u_int %+ SUFFIX +%if ARCH_X86_32 + test src7m, mmsize-1 +%else + test src7q, mmsize-1 +%endif + jne pack_8ch_%2_to_%1_u_int %+ SUFFIX +%else +pack_8ch_%2_to_%1_u_int %+ SUFFIX: +%endif + sub src1q, srcq + sub src2q, srcq + sub src3q, srcq +%if ARCH_X86_64 || HAVE_ALIGNED_STACK + sub src4q, srcq +%else + sub src4m, srcq +%endif + sub src5q, srcq + sub src6q, srcq +%if ARCH_X86_64 + sub src7q, srcq +%else + mov src1m, src1q + sub src7m, srcq +%endif + +%if ARCH_X86_64 + %8 x,x,x,x,m9,x +%elifidn %1, int32 + %define m9 [flt2p31] +%else + %define m9 [flt2pm31] +%endif + +.loop: + mov%3 m0, [srcq ] + mov%3 m1, [srcq+src1q] + mov%3 m2, [srcq+src2q] +%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0 + mov src4q, src4m +%endif + mov%3 m3, [srcq+src3q] + mov%3 m4, [srcq+src4q] + mov%3 m5, [srcq+src5q] +%if ARCH_X86_32 + mov src7q, src7m +%endif + mov%3 m6, [srcq+src6q] + mov%3 m7, [srcq+src7q] + +%if ARCH_X86_64 + TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8 + + %7 m0,m1,x,x,m9,m8 + %7 m2,m3,x,x,m9,m8 + %7 m4,m5,x,x,m9,m8 + %7 m6,m7,x,x,m9,m8 + + mov%3 [dstq], m0 +%else + mov dstq, dstm + + TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, [rsp], [rsp+16], 1 + + %7 m0,m1,x,x,m9,m2 + mova m2, [rsp] + mov%3 [dstq], m0 + %7 m2,m3,x,x,m9,m0 + %7 m4,m5,x,x,m9,m0 + %7 m6,m7,x,x,m9,m0 + +%endif + + mov%3 [dstq+16], m1 + mov%3 [dstq+32], m2 + mov%3 [dstq+48], m3 + mov%3 [dstq+64], m4 + mov%3 [dstq+80], m5 + mov%3 [dstq+96], m6 + mov%3 [dstq+112], m7 + + add srcq, mmsize + add dstq, mmsize*8 +%if ARCH_X86_32 + mov dstm, dstq + mov src1q, src1m +%endif + sub lend, mmsize/4 + jg .loop + REP_RET +%endmacro + +%macro INT16_TO_INT32_N 6 + pxor m2, m2 + pxor m3, m3 + punpcklwd m2, m1 + punpckhwd m3, m1 + SWAP 4,0 + pxor m0, m0 + pxor m1, m1 + punpcklwd m0, m4 + punpckhwd m1, m4 +%endmacro + +%macro INT32_TO_INT16_N 6 + psrad m0, 16 + psrad m1, 16 + psrad m2, 16 + psrad m3, 16 + packssdw m0, m1 + packssdw m2, m3 + SWAP 1,2 +%endmacro + +%macro INT32_TO_FLOAT_INIT 6 + mova %5, [flt2pm31] +%endmacro +%macro INT32_TO_FLOAT_N 6 + cvtdq2ps %1, %1 + cvtdq2ps %2, %2 + mulps %1, %1, %5 + mulps %2, %2, %5 +%endmacro + +%macro FLOAT_TO_INT32_INIT 6 + mova %5, [flt2p31] +%endmacro +%macro FLOAT_TO_INT32_N 6 + mulps %1, %5 + mulps %2, %5 + cvtps2dq %6, %1 + cmpps %1, %1, %5, 5 + paddd %1, %6 + cvtps2dq %6, %2 + cmpps %2, %2, %5, 5 + paddd %2, %6 +%endmacro + +%macro INT16_TO_FLOAT_INIT 6 + mova m5, [flt2pm31] +%endmacro +%macro INT16_TO_FLOAT_N 6 + INT16_TO_INT32_N %1,%2,%3,%4,%5,%6 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m3, m3 + mulps m0, m0, m5 + mulps m1, m1, m5 + mulps m2, m2, m5 + mulps m3, m3, m5 +%endmacro + +%macro FLOAT_TO_INT16_INIT 6 + mova m5, [flt2p15] +%endmacro +%macro FLOAT_TO_INT16_N 6 + mulps m0, m5 + mulps m1, m5 + mulps m2, m5 + mulps m3, m5 + cvtps2dq m0, m0 + cvtps2dq m1, m1 + packssdw m0, m1 + cvtps2dq m1, m2 + cvtps2dq m3, m3 + packssdw m1, m3 +%endmacro + +%macro NOP_N 0-6 +%endmacro + +INIT_MMX mmx +CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N +CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N +CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N +CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N + +PACK_6CH float, float, u, 2, 2, 0, NOP_N, NOP_N +PACK_6CH float, float, a, 2, 2, 0, NOP_N, NOP_N + +INIT_XMM sse +PACK_6CH float, float, u, 2, 2, 7, NOP_N, NOP_N +PACK_6CH float, float, a, 2, 2, 7, NOP_N, NOP_N + +UNPACK_6CH float, float, u, 2, 2, 7, NOP_N, NOP_N +UNPACK_6CH float, float, a, 2, 2, 7, NOP_N, NOP_N + +INIT_XMM sse2 +CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N +CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N +CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N +CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N + +PACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N +PACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N +PACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N +PACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N +PACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N +PACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N +PACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N +PACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N + +UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N +UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N +UNPACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N +UNPACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N +UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N +UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N +UNPACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N +UNPACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N + +CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT +CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT +CONV float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT +CONV float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT +CONV int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT +CONV int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT + +PACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +PACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +PACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT +PACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT +PACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT +PACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT +PACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT +PACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT + +UNPACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +UNPACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +UNPACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT +UNPACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT +UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT +UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT +UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT +UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT + +PACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +PACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +PACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT +PACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT + +UNPACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +UNPACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +UNPACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT +UNPACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT + +PACK_8CH float, float, u, 2, 2, 9, NOP_N, NOP_N +PACK_8CH float, float, a, 2, 2, 9, NOP_N, NOP_N + +PACK_8CH float, int32, u, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +PACK_8CH float, int32, a, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +PACK_8CH int32, float, u, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT +PACK_8CH int32, float, a, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT + +INIT_XMM ssse3 +UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N +UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N +UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N +UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N +UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT +UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT + +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PACK_6CH float, float, u, 2, 2, 8, NOP_N, NOP_N +PACK_6CH float, float, a, 2, 2, 8, NOP_N, NOP_N + +UNPACK_6CH float, float, u, 2, 2, 8, NOP_N, NOP_N +UNPACK_6CH float, float, a, 2, 2, 8, NOP_N, NOP_N + +PACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +PACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +PACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT +PACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT + +UNPACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +UNPACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +UNPACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT +UNPACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT + +PACK_8CH float, float, u, 2, 2, 9, NOP_N, NOP_N +PACK_8CH float, float, a, 2, 2, 9, NOP_N, NOP_N + +PACK_8CH float, int32, u, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +PACK_8CH float, int32, a, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +PACK_8CH int32, float, u, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT +PACK_8CH int32, float, a, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT + +INIT_YMM avx +CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +%endif + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT +CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT +%endif diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/x86/rematrix.asm b/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/x86/rematrix.asm new file mode 100644 index 000000000..7984b9a72 --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/x86/rematrix.asm @@ -0,0 +1,250 @@ +;****************************************************************************** +;* Copyright (c) 2012 Michael Niedermayer +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + + +SECTION_RODATA 32 +dw1: times 8 dd 1 +w1 : times 16 dw 1 + +SECTION .text + +%macro MIX2_FLT 1 +cglobal mix_2_1_%1_float, 7, 7, 6, out, in1, in2, coeffp, index1, index2, len +%ifidn %1, a + test in1q, mmsize-1 + jne mix_2_1_float_u_int %+ SUFFIX + test in2q, mmsize-1 + jne mix_2_1_float_u_int %+ SUFFIX + test outq, mmsize-1 + jne mix_2_1_float_u_int %+ SUFFIX +%else +mix_2_1_float_u_int %+ SUFFIX: +%endif + VBROADCASTSS m4, [coeffpq + 4*index1q] + VBROADCASTSS m5, [coeffpq + 4*index2q] + shl lend , 2 + add in1q , lenq + add in2q , lenq + add outq , lenq + neg lenq +.next: +%ifidn %1, a + mulps m0, m4, [in1q + lenq ] + mulps m1, m5, [in2q + lenq ] + mulps m2, m4, [in1q + lenq + mmsize] + mulps m3, m5, [in2q + lenq + mmsize] +%else + movu m0, [in1q + lenq ] + movu m1, [in2q + lenq ] + movu m2, [in1q + lenq + mmsize] + movu m3, [in2q + lenq + mmsize] + mulps m0, m0, m4 + mulps m1, m1, m5 + mulps m2, m2, m4 + mulps m3, m3, m5 +%endif + addps m0, m0, m1 + addps m2, m2, m3 + mov%1 [outq + lenq ], m0 + mov%1 [outq + lenq + mmsize], m2 + add lenq, mmsize*2 + jl .next + REP_RET +%endmacro + +%macro MIX1_FLT 1 +cglobal mix_1_1_%1_float, 5, 5, 3, out, in, coeffp, index, len +%ifidn %1, a + test inq, mmsize-1 + jne mix_1_1_float_u_int %+ SUFFIX + test outq, mmsize-1 + jne mix_1_1_float_u_int %+ SUFFIX +%else +mix_1_1_float_u_int %+ SUFFIX: +%endif + VBROADCASTSS m2, [coeffpq + 4*indexq] + shl lenq , 2 + add inq , lenq + add outq , lenq + neg lenq +.next: +%ifidn %1, a + mulps m0, m2, [inq + lenq ] + mulps m1, m2, [inq + lenq + mmsize] +%else + movu m0, [inq + lenq ] + movu m1, [inq + lenq + mmsize] + mulps m0, m0, m2 + mulps m1, m1, m2 +%endif + mov%1 [outq + lenq ], m0 + mov%1 [outq + lenq + mmsize], m1 + add lenq, mmsize*2 + jl .next + REP_RET +%endmacro + +%macro MIX1_INT16 1 +cglobal mix_1_1_%1_int16, 5, 5, 6, out, in, coeffp, index, len +%ifidn %1, a + test inq, mmsize-1 + jne mix_1_1_int16_u_int %+ SUFFIX + test outq, mmsize-1 + jne mix_1_1_int16_u_int %+ SUFFIX +%else +mix_1_1_int16_u_int %+ SUFFIX: +%endif + movd m4, [coeffpq + 4*indexq] + SPLATW m5, m4 + psllq m4, 32 + psrlq m4, 48 + mova m0, [w1] + psllw m0, m4 + psrlw m0, 1 + punpcklwd m5, m0 + add lenq , lenq + add inq , lenq + add outq , lenq + neg lenq +.next: + mov%1 m0, [inq + lenq ] + mov%1 m2, [inq + lenq + mmsize] + mova m1, m0 + mova m3, m2 + punpcklwd m0, [w1] + punpckhwd m1, [w1] + punpcklwd m2, [w1] + punpckhwd m3, [w1] + pmaddwd m0, m5 + pmaddwd m1, m5 + pmaddwd m2, m5 + pmaddwd m3, m5 + psrad m0, m4 + psrad m1, m4 + psrad m2, m4 + psrad m3, m4 + packssdw m0, m1 + packssdw m2, m3 + mov%1 [outq + lenq ], m0 + mov%1 [outq + lenq + mmsize], m2 + add lenq, mmsize*2 + jl .next +%if mmsize == 8 + emms + RET +%else + REP_RET +%endif +%endmacro + +%macro MIX2_INT16 1 +cglobal mix_2_1_%1_int16, 7, 7, 8, out, in1, in2, coeffp, index1, index2, len +%ifidn %1, a + test in1q, mmsize-1 + jne mix_2_1_int16_u_int %+ SUFFIX + test in2q, mmsize-1 + jne mix_2_1_int16_u_int %+ SUFFIX + test outq, mmsize-1 + jne mix_2_1_int16_u_int %+ SUFFIX +%else +mix_2_1_int16_u_int %+ SUFFIX: +%endif + movd m4, [coeffpq + 4*index1q] + movd m6, [coeffpq + 4*index2q] + SPLATW m5, m4 + SPLATW m6, m6 + psllq m4, 32 + psrlq m4, 48 + mova m7, [dw1] + pslld m7, m4 + psrld m7, 1 + punpcklwd m5, m6 + add lend , lend + add in1q , lenq + add in2q , lenq + add outq , lenq + neg lenq +.next: + mov%1 m0, [in1q + lenq ] + mov%1 m2, [in2q + lenq ] + mova m1, m0 + punpcklwd m0, m2 + punpckhwd m1, m2 + + mov%1 m2, [in1q + lenq + mmsize] + mov%1 m6, [in2q + lenq + mmsize] + mova m3, m2 + punpcklwd m2, m6 + punpckhwd m3, m6 + + pmaddwd m0, m5 + pmaddwd m1, m5 + pmaddwd m2, m5 + pmaddwd m3, m5 + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + psrad m0, m4 + psrad m1, m4 + psrad m2, m4 + psrad m3, m4 + packssdw m0, m1 + packssdw m2, m3 + mov%1 [outq + lenq ], m0 + mov%1 [outq + lenq + mmsize], m2 + add lenq, mmsize*2 + jl .next +%if mmsize == 8 + emms + RET +%else + REP_RET +%endif +%endmacro + + +INIT_MMX mmx +MIX1_INT16 u +MIX1_INT16 a +MIX2_INT16 u +MIX2_INT16 a + +INIT_XMM sse +MIX2_FLT u +MIX2_FLT a +MIX1_FLT u +MIX1_FLT a + +INIT_XMM sse2 +MIX1_INT16 u +MIX1_INT16 a +MIX2_INT16 u +MIX2_INT16 a + +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +MIX2_FLT u +MIX2_FLT a +MIX1_FLT u +MIX1_FLT a +%endif diff --git a/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/x86/resample.asm b/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/x86/resample.asm new file mode 100644 index 000000000..7107cf9d4 --- /dev/null +++ b/trunk/3rdparty/ffmpeg-4.2-fit/libswresample/x86/resample.asm @@ -0,0 +1,619 @@ +;****************************************************************************** +;* Copyright (c) 2012 Michael Niedermayer +;* Copyright (c) 2014 James Almer gmail.com> +;* Copyright (c) 2014 Ronald S. Bultje +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +%if ARCH_X86_64 +%define pointer resq +%else +%define pointer resd +%endif + +struc ResampleContext + .av_class: pointer 1 + .filter_bank: pointer 1 + .filter_length: resd 1 + .filter_alloc: resd 1 + .ideal_dst_incr: resd 1 + .dst_incr: resd 1 + .dst_incr_div: resd 1 + .dst_incr_mod: resd 1 + .index: resd 1 + .frac: resd 1 + .src_incr: resd 1 + .compensation_distance: resd 1 + .phase_count: resd 1 + + ; there's a few more here but we only care about the first few +endstruc + +SECTION_RODATA + +pf_1: dd 1.0 +pdbl_1: dq 1.0 +pd_0x4000: dd 0x4000 + +SECTION .text + +; FIXME remove unneeded variables (index_incr, phase_mask) +%macro RESAMPLE_FNS 3-5 ; format [float or int16], bps, log2_bps, float op suffix [s or d], 1.0 constant +; int resample_common_$format(ResampleContext *ctx, $format *dst, +; const $format *src, int size, int update_ctx) +%if ARCH_X86_64 ; unix64 and win64 +cglobal resample_common_%1, 0, 15, 2, ctx, dst, src, phase_count, index, frac, \ + dst_incr_mod, size, min_filter_count_x4, \ + min_filter_len_x4, dst_incr_div, src_incr, \ + phase_mask, dst_end, filter_bank + + ; use red-zone for variable storage +%define ctx_stackq [rsp-0x8] +%define src_stackq [rsp-0x10] +%if WIN64 +%define update_context_stackd r4m +%else ; unix64 +%define update_context_stackd [rsp-0x14] +%endif + + ; load as many variables in registers as possible; for the rest, store + ; on stack so that we have 'ctx' available as one extra register + mov sized, r3d +%if UNIX64 + mov update_context_stackd, r4d +%endif + mov indexd, [ctxq+ResampleContext.index] + mov fracd, [ctxq+ResampleContext.frac] + mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod] + mov filter_bankq, [ctxq+ResampleContext.filter_bank] + mov src_incrd, [ctxq+ResampleContext.src_incr] + mov ctx_stackq, ctxq + mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length] + mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div] + shl min_filter_len_x4d, %3 + lea dst_endq, [dstq+sizeq*%2] + +%if UNIX64 + mov ecx, [ctxq+ResampleContext.phase_count] + mov edi, [ctxq+ResampleContext.filter_alloc] + + DEFINE_ARGS filter_alloc, dst, src, phase_count, index, frac, dst_incr_mod, \ + filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ + src_incr, phase_mask, dst_end, filter_bank +%elif WIN64 + mov R9d, [ctxq+ResampleContext.filter_alloc] + mov ecx, [ctxq+ResampleContext.phase_count] + + DEFINE_ARGS phase_count, dst, src, filter_alloc, index, frac, dst_incr_mod, \ + filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ + src_incr, phase_mask, dst_end, filter_bank +%endif + + neg min_filter_len_x4q + sub filter_bankq, min_filter_len_x4q + sub srcq, min_filter_len_x4q + mov src_stackq, srcq +%else ; x86-32 +cglobal resample_common_%1, 1, 7, 2, ctx, phase_count, dst, frac, \ + index, min_filter_length_x4, filter_bank + + ; push temp variables to stack +%define ctx_stackq r0mp +%define src_stackq r2mp +%define update_context_stackd r4m + + mov dstq, r1mp + mov r3, r3mp + lea r3, [dstq+r3*%2] + PUSH dword [ctxq+ResampleContext.dst_incr_div] + PUSH dword [ctxq+ResampleContext.dst_incr_mod] + PUSH dword [ctxq+ResampleContext.filter_alloc] + PUSH r3 + PUSH dword [ctxq+ResampleContext.phase_count] ; unneeded replacement for phase_mask + PUSH dword [ctxq+ResampleContext.src_incr] + mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length] + mov indexd, [ctxq+ResampleContext.index] + shl min_filter_length_x4d, %3 + mov fracd, [ctxq+ResampleContext.frac] + neg min_filter_length_x4q + mov filter_bankq, [ctxq+ResampleContext.filter_bank] + sub r2mp, min_filter_length_x4q + sub filter_bankq, min_filter_length_x4q + PUSH min_filter_length_x4q + PUSH filter_bankq + mov phase_countd, [ctxq+ResampleContext.phase_count] + + DEFINE_ARGS src, phase_count, dst, frac, index, min_filter_count_x4, filter + +%define filter_bankq dword [rsp+0x0] +%define min_filter_length_x4q dword [rsp+0x4] +%define src_incrd dword [rsp+0x8] +%define phase_maskd dword [rsp+0xc] +%define dst_endq dword [rsp+0x10] +%define filter_allocd dword [rsp+0x14] +%define dst_incr_modd dword [rsp+0x18] +%define dst_incr_divd dword [rsp+0x1c] + + mov srcq, r2mp +%endif + +.loop: + mov filterd, filter_allocd + imul filterd, indexd +%if ARCH_X86_64 + mov min_filter_count_x4q, min_filter_len_x4q + lea filterq, [filter_bankq+filterq*%2] +%else ; x86-32 + mov min_filter_count_x4q, filter_bankq + lea filterq, [min_filter_count_x4q+filterq*%2] + mov min_filter_count_x4q, min_filter_length_x4q +%endif +%ifidn %1, int16 + movd m0, [pd_0x4000] +%else ; float/double + xorps m0, m0, m0 +%endif + + align 16 +.inner_loop: + movu m1, [srcq+min_filter_count_x4q*1] +%ifidn %1, int16 +%if cpuflag(xop) + vpmadcswd m0, m1, [filterq+min_filter_count_x4q*1], m0 +%else + pmaddwd m1, [filterq+min_filter_count_x4q*1] + paddd m0, m1 +%endif +%else ; float/double +%if cpuflag(fma4) || cpuflag(fma3) + fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0 +%else + mulp%4 m1, m1, [filterq+min_filter_count_x4q*1] + addp%4 m0, m0, m1 +%endif ; cpuflag +%endif + add min_filter_count_x4q, mmsize + js .inner_loop + +%ifidn %1, int16 + HADDD m0, m1 + psrad m0, 15 + add fracd, dst_incr_modd + packssdw m0, m0 + add indexd, dst_incr_divd + movd [dstq], m0 +%else ; float/double + ; horizontal sum & store +%if mmsize == 32 + vextractf128 xm1, m0, 0x1 + addp%4 xm0, xm1 +%endif + movhlps xm1, xm0 +%ifidn %1, float + addps xm0, xm1 + shufps xm1, xm0, xm0, q0001 +%endif + add fracd, dst_incr_modd + addp%4 xm0, xm1 + add indexd, dst_incr_divd + movs%4 [dstq], xm0 +%endif + cmp fracd, src_incrd + jl .skip + sub fracd, src_incrd + inc indexd + +%if UNIX64 + DEFINE_ARGS filter_alloc, dst, src, phase_count, index, frac, dst_incr_mod, \ + index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ + src_incr, phase_mask, dst_end, filter_bank +%elif WIN64 + DEFINE_ARGS phase_count, dst, src, filter_alloc, index, frac, dst_incr_mod, \ + index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ + src_incr, phase_mask, dst_end, filter_bank +%else ; x86-32 + DEFINE_ARGS src, phase_count, dst, frac, index, index_incr +%endif + +.skip: + add dstq, %2 + cmp indexd, phase_countd + jb .index_skip +.index_while: + sub indexd, phase_countd + lea srcq, [srcq+%2] + cmp indexd, phase_countd + jnb .index_while +.index_skip: + cmp dstq, dst_endq + jne .loop + +%if ARCH_X86_64 + DEFINE_ARGS ctx, dst, src, phase_count, index, frac +%else ; x86-32 + DEFINE_ARGS src, ctx, update_context, frac, index +%endif + + cmp dword update_context_stackd, 0 + jz .skip_store + ; strictly speaking, the function should always return the consumed + ; number of bytes; however, we only use the value if update_context + ; is true, so let's just leave it uninitialized otherwise + mov ctxq, ctx_stackq + movifnidn rax, srcq + mov [ctxq+ResampleContext.frac ], fracd + sub rax, src_stackq + mov [ctxq+ResampleContext.index], indexd + shr rax, %3 + +.skip_store: +%if ARCH_X86_32 + ADD rsp, 0x20 +%endif + RET + +; int resample_linear_$format(ResampleContext *ctx, float *dst, +; const float *src, int size, int update_ctx) +%if ARCH_X86_64 ; unix64 and win64 +%if UNIX64 +cglobal resample_linear_%1, 0, 15, 5, ctx, dst, phase_mask, phase_count, index, frac, \ + size, dst_incr_mod, min_filter_count_x4, \ + min_filter_len_x4, dst_incr_div, src_incr, \ + src, dst_end, filter_bank + + mov srcq, r2mp +%else ; win64 +cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_count, index, frac, \ + size, dst_incr_mod, min_filter_count_x4, \ + min_filter_len_x4, dst_incr_div, src_incr, \ + dst, dst_end, filter_bank + + mov dstq, r1mp +%endif + + ; use red-zone for variable storage +%define ctx_stackq [rsp-0x8] +%define src_stackq [rsp-0x10] +%define phase_mask_stackd [rsp-0x14] +%if WIN64 +%define update_context_stackd r4m +%else ; unix64 +%define update_context_stackd [rsp-0x18] +%endif + + ; load as many variables in registers as possible; for the rest, store + ; on stack so that we have 'ctx' available as one extra register + mov sized, r3d +%if UNIX64 + mov update_context_stackd, r4d +%endif + mov indexd, [ctxq+ResampleContext.index] + mov fracd, [ctxq+ResampleContext.frac] + mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod] + mov filter_bankq, [ctxq+ResampleContext.filter_bank] + mov src_incrd, [ctxq+ResampleContext.src_incr] + mov ctx_stackq, ctxq + mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length] +%ifidn %1, int16 + movd m4, [pd_0x4000] +%else ; float/double + cvtsi2s%4 xm0, src_incrd + movs%4 xm4, [%5] + divs%4 xm4, xm0 +%endif + mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div] + shl min_filter_len_x4d, %3 + lea dst_endq, [dstq+sizeq*%2] + +%if UNIX64 + mov ecx, [ctxq+ResampleContext.phase_count] + mov edi, [ctxq+ResampleContext.filter_alloc] + + DEFINE_ARGS filter_alloc, dst, filter2, phase_count, index, frac, filter1, \ + dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ + dst_incr_div, src_incr, src, dst_end, filter_bank +%elif WIN64 + mov R9d, [ctxq+ResampleContext.filter_alloc] + mov ecx, [ctxq+ResampleContext.phase_count] + + DEFINE_ARGS phase_count, filter2, src, filter_alloc, index, frac, filter1, \ + dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ + dst_incr_div, src_incr, dst, dst_end, filter_bank +%endif + + neg min_filter_len_x4q + sub filter_bankq, min_filter_len_x4q + sub srcq, min_filter_len_x4q + mov src_stackq, srcq +%else ; x86-32 +cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \ + frac, index, dst, filter_bank + + ; push temp variables to stack +%define ctx_stackq r0mp +%define src_stackq r2mp +%define update_context_stackd r4m + + mov dstq, r1mp + mov r3, r3mp + lea r3, [dstq+r3*%2] + PUSH dword [ctxq+ResampleContext.dst_incr_div] + PUSH r3 + mov r3, dword [ctxq+ResampleContext.filter_alloc] + PUSH dword [ctxq+ResampleContext.dst_incr_mod] + PUSH r3 + shl r3, %3 + PUSH r3 + mov r3, dword [ctxq+ResampleContext.src_incr] + PUSH dword [ctxq+ResampleContext.phase_count] ; unneeded replacement of phase_mask + PUSH r3d +%ifidn %1, int16 + movd m4, [pd_0x4000] +%else ; float/double + cvtsi2s%4 xm0, r3d + movs%4 xm4, [%5] + divs%4 xm4, xm0 +%endif + mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length] + mov indexd, [ctxq+ResampleContext.index] + shl min_filter_length_x4d, %3 + mov fracd, [ctxq+ResampleContext.frac] + neg min_filter_length_x4q + mov filter_bankq, [ctxq+ResampleContext.filter_bank] + sub r2mp, min_filter_length_x4q + sub filter_bankq, min_filter_length_x4q + PUSH min_filter_length_x4q + PUSH filter_bankq + PUSH dword [ctxq+ResampleContext.phase_count] + + DEFINE_ARGS filter1, min_filter_count_x4, filter2, frac, index, dst, src + +%define phase_count_stackd dword [rsp+0x0] +%define filter_bankq dword [rsp+0x4] +%define min_filter_length_x4q dword [rsp+0x8] +%define src_incrd dword [rsp+0xc] +%define phase_mask_stackd dword [rsp+0x10] +%define filter_alloc_x4q dword [rsp+0x14] +%define filter_allocd dword [rsp+0x18] +%define dst_incr_modd dword [rsp+0x1c] +%define dst_endq dword [rsp+0x20] +%define dst_incr_divd dword [rsp+0x24] + + mov srcq, r2mp +%endif + +.loop: + mov filter1d, filter_allocd + imul filter1d, indexd +%if ARCH_X86_64 + mov min_filter_count_x4q, min_filter_len_x4q + lea filter1q, [filter_bankq+filter1q*%2] + lea filter2q, [filter1q+filter_allocq*%2] +%else ; x86-32 + mov min_filter_count_x4q, filter_bankq + lea filter1q, [min_filter_count_x4q+filter1q*%2] + mov min_filter_count_x4q, min_filter_length_x4q + mov filter2q, filter1q + add filter2q, filter_alloc_x4q +%endif +%ifidn %1, int16 + mova m0, m4 + mova m2, m4 +%else ; float/double + xorps m0, m0, m0 + xorps m2, m2, m2 +%endif + + align 16 +.inner_loop: + movu m1, [srcq+min_filter_count_x4q*1] +%ifidn %1, int16 +%if cpuflag(xop) + vpmadcswd m2, m1, [filter2q+min_filter_count_x4q*1], m2 + vpmadcswd m0, m1, [filter1q+min_filter_count_x4q*1], m0 +%else + pmaddwd m3, m1, [filter2q+min_filter_count_x4q*1] + pmaddwd m1, [filter1q+min_filter_count_x4q*1] + paddd m2, m3 + paddd m0, m1 +%endif ; cpuflag +%else ; float/double +%if cpuflag(fma4) || cpuflag(fma3) + fmaddp%4 m2, m1, [filter2q+min_filter_count_x4q*1], m2 + fmaddp%4 m0, m1, [filter1q+min_filter_count_x4q*1], m0 +%else + mulp%4 m3, m1, [filter2q+min_filter_count_x4q*1] + mulp%4 m1, m1, [filter1q+min_filter_count_x4q*1] + addp%4 m2, m2, m3 + addp%4 m0, m0, m1 +%endif ; cpuflag +%endif + add min_filter_count_x4q, mmsize + js .inner_loop + +%ifidn %1, int16 +%if mmsize == 16 +%if cpuflag(xop) + vphadddq m2, m2 + vphadddq m0, m0 +%endif + pshufd m3, m2, q0032 + pshufd m1, m0, q0032 + paddd m2, m3 + paddd m0, m1 +%endif +%if notcpuflag(xop) + PSHUFLW m3, m2, q0032 + PSHUFLW m1, m0, q0032 + paddd m2, m3 + paddd m0, m1 +%endif + psubd m2, m0 + ; This is probably a really bad idea on atom and other machines with a + ; long transfer latency between GPRs and XMMs (atom). However, it does + ; make the clip a lot simpler... + movd eax, m2 + add indexd, dst_incr_divd + imul fracd + idiv src_incrd + movd m1, eax + add fracd, dst_incr_modd + paddd m0, m1 + psrad m0, 15 + packssdw m0, m0 + movd [dstq], m0 + + ; note that for imul/idiv, I need to move filter to edx/eax for each: + ; - 32bit: eax=r0[filter1], edx=r2[filter2] + ; - win64: eax=r6[filter1], edx=r1[todo] + ; - unix64: eax=r6[filter1], edx=r2[todo] +%else ; float/double + ; val += (v2 - val) * (FELEML) frac / c->src_incr; +%if mmsize == 32 + vextractf128 xm1, m0, 0x1 + vextractf128 xm3, m2, 0x1 + addp%4 xm0, xm1 + addp%4 xm2, xm3 +%endif + cvtsi2s%4 xm1, fracd + subp%4 xm2, xm0 + mulp%4 xm1, xm4 + shufp%4 xm1, xm1, q0000 +%if cpuflag(fma4) || cpuflag(fma3) + fmaddp%4 xm0, xm2, xm1, xm0 +%else + mulp%4 xm2, xm1 + addp%4 xm0, xm2 +%endif ; cpuflag + + ; horizontal sum & store + movhlps xm1, xm0 +%ifidn %1, float + addps xm0, xm1 + shufps xm1, xm0, xm0, q0001 +%endif + add fracd, dst_incr_modd + addp%4 xm0, xm1 + add indexd, dst_incr_divd + movs%4 [dstq], xm0 +%endif + cmp fracd, src_incrd + jl .skip + sub fracd, src_incrd + inc indexd + +%if UNIX64 + DEFINE_ARGS filter_alloc, dst, filter2, phase_count, index, frac, index_incr, \ + dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ + dst_incr_div, src_incr, src, dst_end, filter_bank +%elif WIN64 + DEFINE_ARGS phase_count, filter2, src, filter_alloc, index, frac, index_incr, \ + dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ + dst_incr_div, src_incr, dst, dst_end, filter_bank +%else ; x86-32 + DEFINE_ARGS filter1, phase_count, index_incr, frac, index, dst, src +%endif + +.skip: +%if ARCH_X86_32 + mov phase_countd, phase_count_stackd +%endif + add dstq, %2 + cmp indexd, phase_countd + jb .index_skip +.index_while: + sub indexd, phase_countd + lea srcq, [srcq+%2] + cmp indexd, phase_countd + jnb .index_while +.index_skip: + cmp dstq, dst_endq + jne .loop + +%if UNIX64 + DEFINE_ARGS ctx, dst, filter2, phase_count, index, frac, index_incr, \ + dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ + dst_incr_div, src_incr, src, dst_end, filter_bank +%elif WIN64 + DEFINE_ARGS ctx, filter2, src, phase_count, index, frac, index_incr, \ + dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ + dst_incr_div, src_incr, dst, dst_end, filter_bank +%else ; x86-32 + DEFINE_ARGS filter1, ctx, update_context, frac, index, dst, src +%endif + + cmp dword update_context_stackd, 0 + jz .skip_store + ; strictly speaking, the function should always return the consumed + ; number of bytes; however, we only use the value if update_context + ; is true, so let's just leave it uninitialized otherwise + mov ctxq, ctx_stackq + movifnidn rax, srcq + mov [ctxq+ResampleContext.frac ], fracd + sub rax, src_stackq + mov [ctxq+ResampleContext.index], indexd + shr rax, %3 + +.skip_store: +%if ARCH_X86_32 + ADD rsp, 0x28 +%endif + RET +%endmacro + +INIT_XMM sse +RESAMPLE_FNS float, 4, 2, s, pf_1 + +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +RESAMPLE_FNS float, 4, 2, s, pf_1 +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +RESAMPLE_FNS float, 4, 2, s, pf_1 +%endif +%if HAVE_FMA4_EXTERNAL +INIT_XMM fma4 +RESAMPLE_FNS float, 4, 2, s, pf_1 +%endif + +%if ARCH_X86_32 +INIT_MMX mmxext +RESAMPLE_FNS int16, 2, 1 +%endif + +INIT_XMM sse2 +RESAMPLE_FNS int16, 2, 1 +%if HAVE_XOP_EXTERNAL +INIT_XMM xop +RESAMPLE_FNS int16, 2, 1 +%endif + +INIT_XMM sse2 +RESAMPLE_FNS double, 8, 3, d, pdbl_1 + +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +RESAMPLE_FNS double, 8, 3, d, pdbl_1 +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +RESAMPLE_FNS double, 8, 3, d, pdbl_1 +%endif diff --git a/trunk/auto/depends.sh b/trunk/auto/depends.sh index 29834e44f..e66a257ca 100755 --- a/trunk/auto/depends.sh +++ b/trunk/auto/depends.sh @@ -429,48 +429,14 @@ if [ $SRS_EXPORT_LIBRTMP_PROJECT = NO ]; then ln -sf ../3rdparty/ffmpeg-4.2-fit && cd ffmpeg-4.2-fit && PKG_CONFIG_PATH=$ABS_OBJS/opus/lib/pkgconfig ./configure \ --prefix=`pwd`/_release \ - --pkg-config-flags="--static" \ - --extra-libs=-lpthread \ - --extra-libs=-lm \ - --disable-programs \ - --disable-doc \ - --disable-htmlpages \ - --disable-manpages \ - --disable-podpages \ - --disable-txtpages \ - --disable-avdevice \ - --disable-avformat \ - --disable-swscale \ - --disable-postproc \ - --disable-avfilter \ - --disable-network \ - --disable-dct \ - --disable-dwt \ - --disable-error-resilience \ - --disable-lsp \ - --disable-lzo \ - --disable-faan \ - --disable-pixelutils \ - --disable-hwaccels \ - --disable-devices \ - --disable-audiotoolbox \ - --disable-videotoolbox \ - --disable-appkit \ - --disable-coreimage \ - --disable-avfoundation \ - --disable-securetransport \ - --disable-iconv \ - --disable-lzma \ - --disable-sdl2 \ - --disable-everything \ - --enable-decoder=aac \ - --enable-decoder=aac_fixed \ - --enable-decoder=aac_latm \ - --enable-decoder=libopus \ - --enable-encoder=aac \ - --enable-encoder=opus \ - --enable-encoder=libopus \ - --enable-libopus && + --pkg-config-flags="--static" --extra-libs=-lpthread --extra-libs=-lm \ + --disable-programs --disable-doc --disable-htmlpages --disable-manpages --disable-podpages --disable-txtpages \ + --disable-avdevice --disable-avformat --disable-swscale --disable-postproc --disable-avfilter --disable-network \ + --disable-dct --disable-dwt --disable-error-resilience --disable-lsp --disable-lzo --disable-faan --disable-pixelutils \ + --disable-hwaccels --disable-devices --disable-audiotoolbox --disable-videotoolbox --disable-appkit --disable-coreimage \ + --disable-avfoundation --disable-securetransport --disable-iconv --disable-lzma --disable-sdl2 --disable-everything \ + --enable-decoder=aac --enable-decoder=aac_fixed --enable-decoder=aac_latm --enable-decoder=libopus --enable-encoder=aac \ + --enable-encoder=opus --enable-encoder=libopus --enable-libopus && make ${SRS_JOBS} && make install cd .. && rm -rf ffmpeg && ln -sf ffmpeg-4.2-fit/_release ffmpeg ) diff --git a/trunk/configure b/trunk/configure index 4e710d183..f7efe2d58 100755 --- a/trunk/configure +++ b/trunk/configure @@ -151,7 +151,8 @@ if [[ $SRS_SHARED_ST == YES ]]; then LibSTfile="-lst"; fi # srtp LibSrtpRoot="${SRS_OBJS_DIR}/srtp2/include"; LibSrtpFile="${SRS_OBJS_DIR}/srtp2/lib/libsrtp2.a" # ffmpeg -LibFfmpegRoot="${SRS_OBJS_DIR}/ffmpeg/include"; LibFfmpegFile="${SRS_OBJS_DIR}/ffmpeg/lib/libavcodec.a ${SRS_OBJS_DIR}/ffmpeg/lib/libswresample.a ${SRS_OBJS_DIR}/ffmpeg/lib/libavutil.a ${SRS_OBJS_DIR}/ffmpeg/lib/libopus.a -lpthread" +LibFfmpegRoot="${SRS_OBJS_DIR}/ffmpeg/include"; LibFfmpegFile="${SRS_OBJS_DIR}/ffmpeg/lib/libavcodec.a ${SRS_OBJS_DIR}/ffmpeg/lib/libswresample.a ${SRS_OBJS_DIR}/ffmpeg/lib/libavutil.a -lpthread" +LibFfmpegRoot="${LibFfmpegRoot} ${SRS_OBJS_DIR}/opus/include"; LibFfmpegFile="${LibFfmpegFile} ${SRS_OBJS_DIR}/opus/lib/libopus.a" # openssl-1.1.0e, for the RTMP complex handshake. LibSSLRoot="";LibSSLfile="" if [[ $SRS_SSL == YES && $SRS_USE_SYS_SSL == NO ]]; then @@ -173,7 +174,7 @@ fi # the link options, always use static link SrsLinkOptions="-ldl"; if [[ $SRS_SRT == YES ]]; then - SrsLinkOptions="${SrsLinkOptions} -pthread"; + SrsLinkOptions="${SrsLinkOptions} -lpthread"; fi if [[ $SRS_SSL == YES && $SRS_USE_SYS_SSL == YES ]]; then SrsLinkOptions="${SrsLinkOptions} -lssl -lcrypto";