From 4278f8c3d07aa89f02697ed6e6bab00728160a4c Mon Sep 17 00:00:00 2001 From: Addison Date: Mon, 24 Mar 2025 10:58:54 +0800 Subject: [PATCH] Restructure for GitBook --- .gitbook/assets/image.png | Bin 0 -> 13271 bytes README.md | 203 +++++++++++++++++++++++++- SUMMARY.md | 5 + lesson-three.md | 201 ++++++++++++++++++++++++++ lesson_02/index.md => lesson-two.md | 60 ++++---- lesson_01/index.md | 213 ---------------------------- lesson_03/image1.png | Bin 11246 -> 0 bytes lesson_03/index.md | 202 -------------------------- 8 files changed, 435 insertions(+), 449 deletions(-) create mode 100644 .gitbook/assets/image.png create mode 100644 SUMMARY.md create mode 100644 lesson-three.md rename lesson_02/index.md => lesson-two.md (63%) delete mode 100644 lesson_01/index.md delete mode 100644 lesson_03/image1.png delete mode 100644 lesson_03/index.md diff --git a/.gitbook/assets/image.png b/.gitbook/assets/image.png new file mode 100644 index 0000000000000000000000000000000000000000..2ecdf7e1815f56dd8247b23901d673e31d1d2a6b GIT binary patch literal 13271 zcmbWeWmp`+5(SF8Ebg$21ozTCNAfKMT+xL(!Xcu)E zaj2Rp(j!Pm(p*Q*LP-gV0n$c-f)2NWg8M51Ifx+#6clVeG!!i48~X3Le3<`Tg`UiZ z{a+iZ@vmSW^qDW}7re#QJ2-K;Ku4^UN@y1h)d-X_I%tJJS*0@W zE3?cy1qGNS)v|X!TZn_^RqFHbp^KR-6ewcyw%3op;+e9JRm4HE4Gg{N@DI?u74hdR303<4?a2D{l6F5UDDoCQ|Bgg+wa-d-W;jaJyi5X^~paV!n|BHBj18_$5y}7)a#B+L@NAnANDJe3vc)kl{+I~DAkco4t8E*q?8zNH z@5Z*?kA0WRrJpmn-^aLuTiDdZ0dmOQJ-$S-rOVXL)nXmQQOE78VaC70YNlN_=hlu= zf!x=P5^71YoH504_NlK8*Wyo$G|fVqgW2!(uy5j~rkiS)&ibBqeRSzfJeeL#wwva^ z#+klmXBv377|PdR+KO&29JCJBn#+TMni)W9d=>@nEQ&#hcF@mQWg;IdyP9e?{@5lO zr@9lL*CyoU0i9W*Gcs*2A&NG2BbZ07rWW#zzf!?+f1*n^7Vja4Gj2eEiD+&qlrIy> zPRPv8&V&-Es3;_S&uC$?0~vJtK@QF74{;Pq)Ve7if$S6vekSh{v(OA;n?)(jSoym8 z93JF3@xnvjuSo7!<83)U8sJBAmlno#uKotQ;z`MzNM<%YNXrNmuf)w$d77EKX~^Re za)ayG^m(i3GIHKcL`PK&G7CFIeOnY@yHD+-zaWVy$3{IFE!4dD)*TXn1905`T`owv zA2)`RYUQ0S)<{3$G#;jEQM%ZAjmYW2|^@ZG4zNK13TP60!+L8ekz@V zSeWP`R{kRbIbrv;-+X3LPH9)mHkS2OiA_4ovcm@3S6PZpkT_`}!VI`3(iOS6%p^J? 
zP9qHiVPqTH7hf=q_qi7?Md2mUUn?}oz~_AQ){^Dp9a13)c#9P(i$yEdPP&4(~yLX}@f_2^N)!%@3O0K&^mP=R-8 z=EE-W@YYDedDD82mPbV20Vmk`qB7XKJ@tnbd8fK$z%PI8J0xHdGNCmzx=5z+m=m(kxMAgY|LwXN&7Qe`3 z-%j|hr`;Hl5meooD7PD1>_H&LMzek=++!p)9U(MRU^*a{Tr7P`A%Z#$kNPvwk4(Y} zT7tQYcnYowI?8eUVlZU}U2v(M9kz$mf*{F>v(2kedp~FR_PN{7N@w1X_q8e?lH1L$ zZGxtk1uV?uh~-np871UdOp}_sR`q=VAchJnnpiQ(dm^cnegnWet;Kqpxu6y+v13cHltoGMf*0Q$(Jc{8T3IiF`hvg>nCTB zBX^0w%5+*1xxZ^GB6RUPun*r^Xf*omRrlnPxYr!hIj>kqz^7j*tc-<*+c-&0y0b;b zdH~O-h`EnRAHLS}Zw}A*^m4IkmXHG2`4U*GRt9>PxLQwkrH~gZLyh(`EP3Q)?l#xH zHl}t10g%Pm9yvCktZ2{0LP`7jV#^zAc{xXCf#*%ODnNyRH!1@~6%g8xcO(4h&tSa# zC3}j7)PhzE!JY_#J&C9J`2LnVVxM&+94^t^Xq0T9)oPuS3N%WR`hnO{Bik!PS)aI; zv0Hqlm(-9R4JIZ8DJ&2nau66iate62@1t<5Drj^h)V~VtIR>qK>Op%uqcHvX%wxI3 z&p>xc!Us;fp_<9J63@A@I(zaYU2{q|YxI68LQj#AVLFDOmY4)X(K(i_Hj@GLyPY0q zf;Rob1VdT(XRNu*W9B0T{NYHLJ&yEJ3J>4!e922zg3;d%-cNp_3SEr?Ca!h#m-9Xp zbg_YXL&-h^4WdfjArl0}e&;15=Fm)3m3(r_P#KZ6&_%7*!R3&egTBiaZ!JM7h@E(d zx;HCI?RVL9Y*fxdli{&)crn^z`dn~E$4#w?3A)Quetz$1=BwBGg&+Lv*fn@=Jvq)8 zVwt=9ZS|l;0w=yo?FF-=^OYI((=xQXN-Ox&>IuD#xy7uw`GMXoNsoB#REo}+;!|_W zq<^&_n|%=;vRyrI4RKc7=MRJJipgE`9{32K{kVB2Vp+foAj@-Ox8gOvbXGjnskaZv zr1-;H2qx0?e=0-j55D3vWu@n@III!BcdtHH6KM`uuBQ7%XT1) zQ%v{HD|S@|p&E_CIj~fiYgf|DEz)+_sxP=}L&U4~)Zc@z@)Y@6lIVNOM<-%i<*oXF z{(eTJ3qsr&L;FY~UC%wvaT>%C=;byYR1B?z*C;hsf=l1FAOzteEVAK-Q&J->(^SIV zGUxBA^AZ~6^AI)pLz5+-sf}dth1s2vnuW_~WU z6Q$2O&T5y}SG~qUntyIm^g;t~T_gP7Q45ya7Go>^i7;Rus#D*|+g-I5xJmSh?d)sV0eSSPnVRXJp0 zcNLGIxQ=N=F^nCI+nRp6ZR|EplO}aka{6=GyR>>@S}D~W|1bdB+HGfpw<{yw^_2>N zpAK=ZBzc6D2y=r}OOTwL9!25AGHGdv4!|rmL>5k87qet)Gw8iKTaRhh;2aC$#=PuW zo!?FD;;_2hpC$%htengNWsP0XiXwJA9Z|Ec>0+6kZskdp8(rbMI? 
z6+GD<11&=QSiS@h68M2C5>dn;juAsTz+El(~K>wU0 z8cf?B^l0ua6ec2!_r^T>zd%$P>E=*ig>Hp6#$yRK9X|f6j_!G4lKtOb=LF4s+X5O5 z`q~b8Y*Jm396}Z7|8w;)RBX?!$99a#FvB=zl3qNm`7@HXD178VnF9mJ7MpA+R+hW_DPn?c}KbC3*(A_S32)f=M z`oGV8a=xh(-uWN~%tfbqNOAoHytphiGVNO`W?c5W8_94~>WM)sA}!xf+#}e|j*Z0l zwWhtRE5nVhK1PIOT!)GlU2r@>tvwe4no^BwWD6Hv$!Snc_gMOvqjsts)U#i?Z0P;0 za~qgEdXF)#Kt^I{Qme;&g7kJu5`s32+0;xhSUt+hbtWRh zxBDGM4_5ABcE~<(lmyTWW&kvbsX9h^L6SQLAbejKR0)o)@AJwOEG>Ja@8d>D-91Y` zf+XT3;gqhAWKLZJ=A|v7ov8*>HD-!b_h+eNiBY~2xU)#h997gJ ziFV3>RXv(TbVBmS$-Za%ll5-~*Ci8cSb;CGmO0zmQjw9C>)M{C97w!=znfACFHX9lCaFjiRk(7i|q9T#=VRO_fZ|pLfw^(+@LNdVgQ%ktW@s z9r!zd{M2tL_;zgPq0fV^rON_~?PMOR#ZB-I2$0 zP2Q)~o6l$+UOP9Tdlbz-TF7_cDQ}g4P)VI``wBS7r_pOyvCc_pln_rN)^~mWLxklT z4e8GQN@Yo2t(6AYDiOjfi7qT!6l;3!gxQ17rXyhEpeyg+NwFn9<)Rw z8W=}CP{+jh_G^2ZJjY4lJgdXHhqp&H!byYurX2n4qDD+Jf(7(1b%^zVt8;5EYD?sf zihYP`R1M%X@{3*QO|A;93X+Fd1Fj-2JtaVuCu-8qV?pS*KBc&&ItMP&`|OrOVqy}M zQ6G*-%SXp=8dKs><9>$a_7L52G#Pj14}uS_lco6%B@_fnx|&7#)&rsJaBUEy$dsgC zB2nMig|ZQYj-b$g>0@6}t~E+9Dk~_6un&l$7nUMf+x-1g z@M$XWe58W7mp-VsMyh)~+TgrS?X+)c>*m}yS!cXAU2InJ$%eyydSG``zqJMg&h&5P zLP!iksnD$%+X2k!T$N=CE;aecsOmmNEOgK#(~S9#5ATWgsykyP7bTU5g3SkkSw`*3QFDzK8Aa(cD>rB!2H8BX}=A<#n@BVM5G(29D^0T=a;fduT z32M*R&{MA<{gbcqZf9abMU(u|F=}W>Q(!xEn7A{_P!xcjh-A5l6<@8qKu>8F~Qn2b|$ zm|CSVt+y#wz3|2u+DbIy)k|>%8tH(UKD+fpqwspDX{P#qZ=U`!VFZv}OU!ek^N)`> z&Ry(t;j<7Sf%e3&Dp^4~H8+jbL1Rd&R)?!|yh|o-c`FTKBTuetjX*W4R*}LVtjP(HvCv?-A_ZB{DYZ36T;n4k*v#qLm0ep6Msh%E z3OvN;Km6*C7iJ^X?~!eH`etF%@L``jH9M1efs9M`xIi?*fmi|Mdy&Y5j3XpcwXkaz zk84h?222Q`f{G>IOZ7wxcj1sUoNtHAVklPNrj_d&}5g zjCw3N2hC;H`X#0|r4~D#8y}mCw0ge8Eq;+xcxmdc!(1n?z^v|NE{vAe#(mE%Rvr*F zm^Lg4v^v}Is;)DxN#LKA+6=#jy;J=Y?Q>ELV{+R^^h)*Uv9{oN^F!i9;|AfJ!1P*CRPd znW=@T76Mo0x<2t43O6NnvV3gy#w`^ycXNvJEHTfdiD+(5x8;PcAip(3RrJt~tp{*f zS89Bt+d=L(UbgWU!58@QzJ53tSf1>#jOj+>@F^AOf4fn331ss2)m@Q6h$VIl?)i{M z>{Q5}MZ4~ld6Cb$MfP3tPQ+G;HOD5GZjWmk005j8-1byMp34uMBc_&qcyMOsJ?vQ z*Bkv$HmVpRbUW&3;GlUA2MV27)Hm;(j)6YD=#M(whoyZ2<6_~8Sx_QfaSTxr_qs)4 
zf78Gg3z(k=C*bp%=2~3xovVBLgm0@1t27;b9S6}zKxao}J zkRHPwD%MnkwbbtR926}mSZtKwhfdpdG2w@i0&lWz6KK*4ffE@yETq=EgaOw4^+qf8 zqVFpSQc(V-hKq*+F1uWB^R> z;^86^8FP&E%Vv@b{6Gvn!*#xSOKEkMy1Vu$=3;>@qYw?!5v^-;15C(PNu#wGob+>| zCT=~9gzXL9Q6r$0TM6^~HOvdcrB%>OmWI(O)nJpOH?HZlS#+5c`$igV4wUaXcwXJV zt=!grSfRtG9ouMG!d65$#K)`bS~JEb98i<{v8~&apLwSLuEr$3It9LsNm9T8tQ0HgwAFqir^AK+m6% zp}X{Zwi!d#6e@+t)#FgskEy^|O&O^IcbzOm|I882A=8~KJRRIi%ig1R=vQ)Dr5t;* zD9ICQtCrO;$G#@iy#iVR09X+5xl76}|2^xVDAI|%hi0S^J%V_O^`+!!DK0EB8}+IN zbb0+%&FlS}@Y}15$kWt_=GS|;AF(92tc%VX32Ywp2-Cqye4iHZ#xOW)Jpy4?NHWZ; z+sX2D8=uO48syZQYFIUSVjji$EOusrY=uYp%s{GY!5+eU<$vN2#&9+t-kW%!h?i4- zk~OA~y|zpbLqA=uu0*VsyZ%Ytzrht^6Sg`sxL>@7QW9b4P~(Ms|JvXoXG>9tQi%Q# zW$LR<3qeA&BtjY+hpa3=WlJf;p!SqO*i)-Ny**^P7Y;+xEmvD4$`oX;$l?Op z)R+kINe1~p>Z?zKk&N~4Re3G#=p8&fm(nN`5@e+UAQ9C8Uvj`;oFcdD%Z#v+ zlWMVVpKKB$jAP#Ip$N6xNZARq_O!oq1|%9KYXgTt=@AI;oJrSPq{e=MlskSvT{>NO z1rXk3zy_AF@C;&{3iy0XFJmE4A#HVjO_}iumO!Hc!TEroew{W|ruzul{kj;oh*{nz zJo1BGX<)|DzMUgb(3WpQr+^_mU2E)ojqB~?Ox_JgKO_>CqWcvW(^GOLgLrpTljTM> z2Q%$gFli-*O1I$YY7osL_(*BUWj;^t1FX?7vXz&_OQW;5kAMhE=H_>(qhdn(Tf)uPJ#sVXcmCkd!aAzd8geGH zJ~yNAq@_`BtVS;fkPKf2J=P#Ykl{iYtza(VC(6Ur=8+kzWJVsm`CC-vZ|lxvXiUN) zB}iIM8sQG*o%U@ZWxx+^2JVfrCMOiS-TY`FuF>u8+aAHiyMFt@*8T6dXVC4pQ!QO9 z2MEh|d$QP*Q~}fE)&l}$OD=C-Cx|azn~m>XtL{HEfNEc>kyGutQ*patFKoxN+H2#( z<-DJcvkz9!p zEQo2V6@c--EDhdLZb2`d)R%$s<(ja3fWRG&(NSM=W50Gsoe5rDe6fe41JS||J_e$D z|GCjdrEqic|HXIROix4#Q>X95%NN$*<#vt`4%Kq|jEQ#7S!nuUih76}t|O-bxLW%C zG(0rxy-5V~s&OUN%_47D_reBMj^$#|jMOtIf0OPmo#+q6A=@&U&}ZO z+v1buOGaK8g@~vyOD*DTpW4@nXuNR0*M}T#Ke1WYerl-P!_o)He;biV!(uUtEp`PqB8xTx)N7BOtS zz9iA38HalcN@@ra#5qs|$0qok*L5s_n;RRTbf0|D!lAOODIz%G>Y5SV81df(g}J5- z_XcZGZe}8p@*D6ri^sjuE&bXrgx8njBUoOqbp-4cMk6il0>QyZy}0;Rb*MI${`d2U zRqe$Oj?lCH%Dw#Bo6c2v)1C8o%aJM)^80XbQLk4fii!j+sN%B>Y&@i>qo*7k5tcbK z#-s(o5QkC~e8+=V{GvM`1h?YMqZrds~&CG-sHCVe&pMun#k$L!-34O|$Z#JY(04?s9EV0N# zG^+O5@v_Zr5IZXPj)jIccad@Nxvi9?(yNG)H}=UX5vRzsr{0^vliV>`J^AJp>YI>>~Sle_aHO$KvW4=zpa9MT6wf{K~ 
zv_^-&!90ncaA(=eMtrKJ7q@>{0#F;<8L!?XLpZCxTu5KRUHKs{%7h=FyZSbhEBU%`(i4+u5Uou%{6W*0D&iCKqaAO+hGC3^{^y%)@_}!iK#}HJ~_k zPMQ1L-tMuQV~}%*!~|^|d_#4KKf-9N_~Rq_70EN6W=#V~7k(fVla9e&n8(AXp?SSdwCS-Vqt)u6_dD=*C`4gSoh zC8_0DszyAojmn5&)brXS^iB_f@;k?&0D4TOuL`OsGaA!B@WRq&&1WM&#xj~{nKkHZUDQt4`mIoqBqja~Q(dsa=iFN1VJ z833=n=4&8UzXB%twMeT(xT`?Ti)DrxL9fJCcbI*Kzqvn`>QK~SJYgz0xzGVm&F6H$ zDDewn$uPIeaq%KC#el$wrs(2M63WQ6QG9}Q;&JoGBWstz?T^SThp4FxtavhRm7He= z6UlCom;3Mj9Gtnb%|eW!SM^xXXmjG3V>TH|@Oeyb;g=Hi>c0=$K=of<3||P2Kfxxj zqJ8>6Z0e4I)&fFSBvM=$RdoI;kHLu~K{cYCj>2I=+<#kuy8I(as8TBUn%EbhC`Tlm zUz&%R`Ta9592*Z*C4%FumqgF+KIH!AvT<--%QH{yaxBr0dv}q}ymioi4t*#9#zYM2 zBOk=Q7gw>5#kS|*U0xCzp7{#f8%S4dC{iOf4+#e@Q>DI}qj@C@wYz_K zY3}+&33F!#(^@;$AwzwNuj1cH9%pVJ_UlahXX4?p#&T8T~VrnD_fqf7V z*70XGk`{lb95S}L^Tw>^AS%m0Q|zX~DeU4iQo&p?Nfs-2`7ZZdmr}v)V#JET(pC43_jxp*TS+0XV-;`cGT%KWaas#a|i9X zYS`FR?!cnl#6xvWAJ1kTEkak~YF8R}l&{Y*6946( z$mz{{Y%Hj9GLaX&HSPnFZE?L8x=wmtZLA!px6mNIpjCqmW+?SAm9+RziSThFq3KYy z)NZmp?eB7a9g5iWmwMb^9_%)SmgNa?ydz}gTx|%GML>d^$@m!S=JM){3Xl;n9^kK< z{&tT!e0&OQ#rCD67QDY(UQwLan5GGEzR3VMV#6fi1-;Gk6n9h`-2*$pT|dn}665%p*_e`{>o4ScLBq7=-By5q2<9ibasee5CAFpNEnNdMO(&_(Q{c$-# zXR>>PxZ%n=H`cY4>n(TFL@Nlp+oH81(vu4pBygo^&imeq$x~wywA7ScOE-BkuR*o} zek{S#jOu&kc90i8``KNL&gs6XlN7eLgP@@h^4mkZ6LNi5VtFf zcSbS(T5iWxT{NqB@YGVsp~WCI7t*etA0m<0XdD)|l>%aT1%uLWMKk5Kz+T2re*u{my3~tCDH16+}ou-7dOPLHFU49=aNm ztXs`Rec?2!PbLKN52)I~>CqTnt1?#a(Z&S}{ula}=s0+;ZKW>dFn8rQGA&vIA5HSL z58(f&3ebhAGO3VCk2oOh{I?Dex!2!7)$Q0=hJaHSldNU!ygV3ftx=2bmui-)MGg4M zRp)zWftuQ<7?>lc|Hv4+sNgqL@JGm0VzWWdgC+a~N9RABE69o47}dIoQ-TRAK~zHx zwCRfMGDyiu3=kaU*zbpAymLjPdh@0M;U!vaCNq-OC=r?`S`?R1y>98Kx6r(p_*n#= zHT{Pv)mU$4X|-g5+VKv*z(Qa5NsFYHjibE0w1;xu;W%cL0J)Wrj(wSSDOpKGz>A9MY-Z#NGV^-3?RY}_&v zzBfnBT1Kg^wz2C1jxm>)4#<=ThfHrnBM0f#@W#yHWqEr(lt*f4;?l;sy6_PA_I)>G z0BEsh7hrNb8C_qY?0hYK_}pB+bJ!WaR8Ef8h2iZoUl1v?RHlsf1H)T4?3BN5j(wwE z^e;${{$G&ZFI`bXLVsZ>$-K6-D%Gf{+1N$9VhgG>_A1mzkg?1U_J+HHlja9^X&839 z(|y*n^q@;c0g~0L&3>R0Rdrc`EXw@W!H~pSwoHp>mr$&dOr!X&K}ole@}4eX{^Kg} 
z1EjzuM5RkuC36yTDt|=${fcHpyGR+r7;@AheV)*r&6sSsl8+iX#a^{d)OdgY*1nfE zLLO6nRP4gXXa9D%Uyrl5UsX>#;quYvfFx=@|9vG>e+QJ-LCfHN)V#6FQyM&}z3G2J zcn{B5E9iK!-oyPgStX_k2pa#j0fNR4o{+>&Go@$qA$~kDWqy#pJfkOUXzGdL0tVcq zR}ax@~iLhC0O?r*U zUx%`{i}-Tqu~#npE=k~L8%5PwH84MR!7s)NM{2p z#k-Q)Y|y4m;#EJlnkVcZ5FXCp^V{!S-MAZV*`X)=k9-9k!As4HGi7Sy>!!1Wv(41J zVz<1z8(`D&rOF06Jx&*dv8TIgFr)X?;3IGnbLQre%cp#%@~sq`kj=6hGl;%r$##pw ziN-SCC<$z4xmxsx_93U8syoa1JS}Ig%xnJ1wK(}>(z)C$TUoc${4YhObuFqPI~Ls! z+$J{>`Ex`vL8yv^JV_Gh)x0e|A+s!TMDGLU2i4NXs@Dw8XDg&cbGl zT?x=?`IF=ny)2ykex8gm3ZWt6KKOKm0%lL;$=ptObh))^)Kv&9g`d>b6zH-R%mnQ4 zw-8J<;_c-cAO_Co>U6Tm)#wGH0+u8T`acppQ;5`5m{j9hoHp!W|D%EhY7 zW~OwPuG#9FH<(hBjcw?MAXmDlX4#if5u~S+J1Ipn19rB3jCMIHd3c7VzSIIj9XnWO zO~oz$3?Cr;9IK#qNSU4=-E5G-=rgNT2nGgDvL{e-fJn}mNRCg7xID|)qUSvlMLw8; zxrDZfL*ISH51C--VeiDBie2^*>J50oKa1%tW~LbE43C7<`SRYVokp zCWQG6VKEayTq-7#(?#3=c?LW1F9=;6!Q0(?dnT10N--!bsH*)xLUaLy34SsY1#0?B z=;{YxDHEYftpFID^Z)zu0@6xs0BpE# z%^8lDoeo~Vyw}rl$-P;w5uSi70ONJ5!Jl`*-_M?+VA5e`7ZyDhZl?{{sNur`LA|Zi zj%bF)!)r9qj+;Q*6Mf-GDDVDp)>z<3M;d0!IX@x%X{^~FC|%C=e_||elyK-|VY7L^ zU8@vC$Yn8$DH*tD`VN_rYp?KTt}yu1D1u}Xabcdefghijklmnop + +But it could be eight words (16-bit integers): + +
abcdefgh
+ +Or four double words (32-bit integers): + +
abcd
+ +Or two quadwords (64-bit integers): + +
ab
+ +To recap: + +- **b**ytes - 8-bit data +- **w**ords - 16-bit data +- **d**oublewords - 32-bit data +- **q**uadwords - 64-bit data +- **d**ouble **q**uadwords - 128-bit data + +The bold characters will be important later. + +**x86inc.asm include**\ +You’ll see in many examples we include the file x86inc.asm. X86inc.asm is a lightweight abstraction layer used in FFmpeg, x264, and dav1d to make an assembly programmer's life easier. It helps in many ways, but to begin with, one of the useful things it does is it labels GPRs, r0, r1, r2. This means you don’t have to remember any register names. As mentioned before, GPRs are generally just scaffolding so this makes life a lot easier. + +**A simple scalar asm snippet** + +Let’s look at a simple (and very much artificial) snippet of scalar asm (assembly code that operates on individual data items, one at a time, within each instruction) to see what’s going on: + +```wasm +mov r0q, 3 +inc r0q +dec r0q +imul r0q, 5 +``` + +In the first line, the _immediate value_ 3 (a value stored directly in the assembly code itself as opposed to a value fetched from memory) is being stored into register r0 as a quadword. Note that in Intel syntax, the source operand (the value or location providing the data, located on the right) is transferred to the destination operand (the location receiving the data, located on the left), much like the behavior of memcpy. You can also read it as “r0q = 3”, since the order is the same. The “q” suffix of r0 designates the register as being used as a quadword. inc increments the value so that r0q contains 4, dec decrements the value back to 3. imul multiplies the value by 5. So at the end, r0q contains 15. + +Note that the human readable instructions such as mov and inc, which are assembled into machine code by the assembler, are known as _mnemonics_. You may see online and in books mnemonics represented with capital letters like MOV and INC but these are the same as the lower case versions. 
In FFmpeg, we use lower case mnemonics and keep upper case reserved for macros. + +**Understanding a basic vector function** + +Here’s our first SIMD function: + +```wasm +%include "x86inc.asm" + +SECTION .text + +;static void add_values(uint8_t *src, const uint8_t *src2) +INIT_XMM sse2 +cglobal add_values, 2, 2, 2, src, src2 + movu m0, [srcq] + movu m1, [src2q] + + paddb m0, m1 + + movu [srcq], m0 + + RET +``` + +Let’s go through it line by line: + +```wasm +%include "x86inc.asm" +``` + +This is a “header” developed in the x264, FFmpeg, and dav1d communities to provide helpers, predefined names and macros (such as cglobal below) to simplify writing assembly. + +```wasm +SECTION .text +``` + +This denotes the section where the code you want to execute is placed. This is in contrast to the .data section, where you can put constant data. + +```wasm +;static void add_values(uint8_t *src, const uint8_t *src2) +INIT_XMM sse2 +``` + +The first line is a comment (the semi-colon “;” in asm is like “//” in C) showing what the function signature looks like in C. The second line shows how we are initialising the function to use XMM registers, using the sse2 instruction set. This is because paddb is an sse2 instruction. We’ll cover sse2 in more detail in the next lesson. + +```wasm +cglobal add_values, 2, 2, 2, src, src2 +``` + +This is an important line as it defines a C function called “add_values”. + +Let’s go through each item one at a time: + +- The first parameter after the function name shows it has two function arguments. +- The parameter after that shows that we’ll use two GPRs for the arguments. In some cases we might want to use more GPRs so we have to tell x86inc.asm we need more. +- The parameter after that tells x86inc.asm how many XMM registers we are going to use. +- The following two parameters are labels for the function arguments. + +It’s worth noting that older code may not have labels for the function arguments but instead address GPRs directly using r0, r1 etc. 
+ +```wasm + movu m0, [srcq] + movu m1, [src2q] +``` + +movu is shorthand for movdqu (move double quad unaligned). Alignment will be covered in another lesson but for now movu can be treated as a 128-bit move from \[srcq]. In the case of mov, the brackets mean that the address in \[srcq] is being dereferenced, the equivalent of `*src` in C. This is what’s known as a load. Note that the “q” suffix refers to the size of the pointer (i.e. in C it represents `sizeof(src) == 8` on 64-bit systems, and x86inc.asm is smart enough to use 32-bit on 32-bit systems) but the underlying load is 128-bit. + +Note that we don’t refer to vector registers by their full name, in this case xmm0, but as m0, an abstracted form. In future lessons you’ll see how this means you can write code once and have it work on multiple SIMD register sizes. + +```wasm +paddb m0, m1 +``` + +paddb (read this in your head as _p-add-b_) is adding each byte in each register as shown below. The “p” prefix stands for “packed” and is used to identify vector instructions vs scalar instructions. The “b” suffix shows that this is bytewise addition (addition of bytes). + +\+ + +\= + +```wasm +movu [srcq], m0 +``` + +This is what’s known as a store. The data is written back to the address in the srcq pointer. + +```wasm +RET +``` + +This is a macro to denote the function returns. Virtually all assembly functions in FFmpeg modify the data in the arguments as opposed to returning a value. + +As you’ll see in the assignment, we create function pointers to assembly functions and use them where available. +eyes to what's actually going on in your computer. **Required Knowledge** -* Knowledge of C, in particular pointers. 
If you don't know C, work through [The C Programming Language](https://en.wikipedia.org/wiki/The_C_Programming_Language) book +- High School Mathematics (scalar vs vector, addition, multiplication etc) **Lessons** diff --git a/SUMMARY.md b/SUMMARY.md new file mode 100644 index 0000000..24b4a7d --- /dev/null +++ b/SUMMARY.md @@ -0,0 +1,5 @@ +# Table of contents + +- [Lesson one](README.md) +- [Lesson two](lesson-two.md) +- [Lesson three](lesson-three.md) diff --git a/lesson-three.md b/lesson-three.md new file mode 100644 index 0000000..577aa32 --- /dev/null +++ b/lesson-three.md @@ -0,0 +1,201 @@ +# Lesson three + +Let’s explain some more jargon and give you a short history lesson. + +**Instruction Sets** + +You may have seen in the previous lesson we talked about SSE2 which is a set of SIMD instructions. When a new CPU generation is released it may come with new instructions and sometimes larger register sizes. The history of the x86 instruction set is very complex so this is a simplified history (there are many more subcategories): + +- MMX - Launched in 1997, first SIMD in Intel Processors, 64-bit registers, historic +- SSE (Streaming SIMD Extensions) - Launched in 1999, 128-bit registers +- SSE2 - Launched in 2000, many new instructions +- SSE3 - Launched in 2004, first horizontal instructions +- SSSE3 (Supplemental SSE3) - Launched in 2006, new instructions but most importantly pshufb shuffle instruction, arguably the most important instruction in video processing +- SSE4 - Launched in 2008, many new instructions including packed minimum and maximum. +- AVX - Launched in 2011, 256-bit registers (float only) and new three-operand syntax +- AVX2 - Launched in 2013, 256-bit registers for integer instructions +- AVX512 - Launched in 2017, 512-bit registers, new operation mask feature. These had limited use at the time in FFmpeg because of CPU frequency downscaling when new instructions were used. Full 512-bit shuffle (permute) with vpermb. 
+- AVX512ICL - Launched 2019, no more clock frequency downscaling. +- AVX10 - Upcoming + +It’s worth noting that instruction sets can be removed as well as added to CPUs. For example AVX512 was [removed](https://www.igorslab.de/en/intel-deactivated-avx-512-on-alder-lake-but-fully-questionable-interpretation-of-efficiency-news-editorial/), controversially, in 12th Generation Intel CPUs. It’s for this reason that FFmpeg does runtime CPU detection. FFmpeg detects the capabilities of the CPU it’s running on. + +As you saw in the assignment, function pointers are C by default and are replaced with a particular instruction set variant. This means detection is done once and then never needs to be done again. This is in contrast to many proprietary applications which hardcode a particular instruction set making a perfectly functional computer obsolete. This also allows optimised functions to be turned on/off at runtime. This is one of the big benefits of open source. + +Programs like FFmpeg are used on billions of devices around the world, some of which may be very old. FFmpeg technically supports machines supporting SSE only, which are 25 years old! Thankfully x86inc.asm is capable of telling you if you use an instruction that’s not available in a particular instruction set. 
+ +To give you an idea of real-world capabilities, here is the instruction set availability from the [Steam Survey](https://store.steampowered.com/hwsurvey/Steam-Hardware-Software-Survey-Welcome-to-Steam) as of November 2024 (this is obviously biased towards gamers): + +| Instruction Set | Availability | +| ------------------------------------------------------------- | ------------ | +| SSE2 | 100% | +| SSE3 | 100% | +| SSSE3 | 99.86% | +| SSE4.1 | 99.80% | +| AVX | 97.39% | +| AVX2 | 94.44% | +| AVX512 (Steam does not separate between AVX512 and AVX512ICL) | 14.09% | + +For an application like FFmpeg with billions of users, even 0.1% is a very large number of users and bug reports if something breaks. FFmpeg has extensive testing infrastructure for testing the variations of CPU/OS/Compiler in our [FATE testsuite](https://fate.ffmpeg.org/?query=subarch:x86_64%2F%2F). Every single commit is run on hundreds of machines to make sure nothing breaks. + +Intel provides a detailed instruction set manual here: [https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html) + +It can be cumbersome to search through a PDF so there is an unofficial web based alternative here: [https://www.felixcloutier.com/x86/](https://www.felixcloutier.com/x86/) + +There is also a visual representation of SIMD instructions available here: [https://www.officedaytime.com/simd512e/](https://www.officedaytime.com/simd512e/) + +Part of the challenge of x86 assembly is finding the right instruction for your needs. In some cases instructions can be used in a way they were not originally intended. + +**Pointer offset trickery** + +Let’s go back to our original function from Lesson 1, but add a width argument to the C function. + +We use ptrdiff_t for the width variable instead of int to make sure that the upper 32-bits of the 64-bit argument are well defined (i.e. zero for a non-negative width) rather than left unspecified. 
If we directly passed an int width in the function signature, and then attempted to use it as a quad for pointer arithmetic (i.e. using `widthq`) the upper 32-bits of the register can be filled with arbitrary values. We could fix this by sign extending width with `movsxd` (also see macro `movsxdifnidn` in x86inc.asm), but this is an easier way. + +The function below has the pointer offset trickery in it: + +```wasm +;static void add_values(uint8_t *src, const uint8_t *src2, ptrdiff_t width) +INIT_XMM sse2 +cglobal add_values, 3, 3, 2, src, src2, width + add srcq, widthq + add src2q, widthq + neg widthq + +.loop: + movu m0, [srcq+widthq] + movu m1, [src2q+widthq] + + paddb m0, m1 + + movu [srcq+widthq], m0 + add widthq, mmsize + jl .loop + + RET +``` + +Let’s go through this step by step as it can be confusing: + +```wasm + add srcq, widthq + add src2q, widthq + neg widthq +``` + +The width is added to each pointer such that each pointer now points to the end of the buffer to be processed. The width is then negated. + +```wasm + movu m0, [srcq+widthq] + movu m1, [src2q+widthq] +``` + +The loads are then done with widthq being negative. So on the first iteration \[srcq+widthq] points to the original address of srcq, i.e points back to the beginning of the buffer. + +```wasm + add widthq, mmsize + jl .loop +``` + +mmsize is added to the negative widthq bringing it closer to zero. The loop condition is now jl (jump if less than zero). This trick means widthq is used as a pointer offset **and** as a loop counter at the same time, saving a cmp instruction. It also allows the pointer offset to be used in multiple loads and stores, as well as using multiples of the pointer offsets if needed (remember this for the assignment). + +**Alignment** + +In all our examples we have been using movu to avoid the topic of alignment. Many CPUs can load and store data faster if the data is aligned, i.e if the memory address is divisible by the SIMD register size. 
Where possible we try to use aligned loads and stores in FFmpeg using mova. + +In FFmpeg, av_malloc is able to provide aligned memory on the heap and the DECLARE_ALIGNED C preprocessor directive can provide aligned memory on the stack. If mova is used with an unaligned address, it will cause a segmentation fault and the application will crash. It’s also important to be sure that the alignment value corresponds to the SIMD register size, i.e 16 with xmm, 32 for ymm and 64 for zmm. + +Here is how to align the beginning of the RODATA section to 64-bytes: + +```wasm +SECTION_RODATA 64 +``` + +Note that this just aligns the beginning of RODATA. Padding bytes might be needed to make sure the next label remains on a 64-byte boundary. + +**Range expansion** + +Another topic we have avoided until now is overflowing. This happens, for example, when the value of a byte goes beyond 255 after an operation like addition or multiplication. We may want to perform an operation where we need an intermediate value larger than a byte (e.g words), or potentially we want to leave the data in that larger intermediate size. + +For unsigned bytes, this is where punpcklbw (packed unpack low bytes to words) and punpckhbw (packed unpack high bytes to words) comes in. + +Let’s look at how punpcklbw works. The syntax for the SSE2 version from the Intel Manual is as follows: + +```wasm +PUNPCKLBW xmm1, xmm2/m128 +``` + +This means its source (right hand side) can be an xmm register or a memory address (m128 means a memory address with the standard \[base + scale\*index + disp]) syntax and the destination an xmm register. + +The officedaytime.com website above has a good diagram showing what’s going on: + +
+ +You can see that bytes are interleaved from the lower half of each register respectively. But what has this got to do with range extension? If the src register is all zeros this interleaves the bytes in dst with zeros. This is what is known as _zero extension_ as the bytes are unsigned. punpckhbw can be used to do the same thing for the high bytes. + +Here is a snippet showing how this is done: + +```wasm +pxor m2, m2 ; zero out m2 + +movu m0, [srcq] +movu m1, m0 ; make a copy of m0 in m1 +punpcklbw m0, m2 +punpckhbw m1, m2 +``` + +`m0` and `m1` now contain the original bytes zero extended to words. In the next lesson you’ll see how three-operand instructions in AVX make the second movu unnecessary. + +**Sign extension** + +Signed data is a bit more complicated. To range extend a signed integer, we need to use a process known as [sign extension](https://en.wikipedia.org/wiki/Sign_extension). This pads the MSBs with the sign bit. For example: -2 in int8_t is 0b11111110. To sign extend it to int16_t the MSB of 1 is repeated to make 0b1111111111111110. + +`pcmpgtb` (packed compare greater than byte) can be used for sign extension. By doing the comparison (0 > byte), all the bits in the destination byte are set to 1 if the byte is negative, otherwise the bits in the destination byte are set to 0. punpckX can be used as above to perform the sign extension. If the byte is negative the corresponding mask byte is 0b11111111 and otherwise it’s 0b00000000. Interleaving the byte value with the output of pcmpgtb performs a sign extension to word as a result. + +```wasm +pxor m2, m2 ; zero out m2 + +movu m0, [srcq] +movu m1, m0 ; make a copy of m0 in m1 + +pcmpgtb m2, m0 +punpcklbw m0, m2 +punpckhbw m1, m2 +``` + +As you can see there is an extra instruction compared to the unsigned case. + +**Packing** + +packuswb (pack words to bytes with unsigned saturation) and packsswb (pack words to bytes with signed saturation) let you go from word to byte. They let you pack two SIMD registers containing words into one SIMD register containing bytes. 
Note that if the values exceed the byte range, they will be saturated (i.e clamped to the smallest or largest representable value). + +**Shuffles** + +Shuffles, also known as permutes, are arguably the most important instruction in video processing and pshufb (packed shuffle bytes), available in SSSE3, is the most important variant. + +For each byte the corresponding source byte is used as an index of the destination register, except when the MSB is set the destination byte is zeroed. It’s analogous to the following C code (although in SIMD all 16 loop iterations happen in parallel): + +```c +for(int i = 0; i < 16; i++) { + if(src[i] & 0x80) + dst[i] = 0; + else + dst[i] = dst[src[i] & 0x0F]; +} +``` + +Here’s a simple assembly example: + +```wasm +SECTION_RODATA 64 + +shuffle_mask: db 4, 3, 1, 2, -1, 2, 3, 7, 5, 4, 3, 8, 12, 13, 15, -1 + +section .text + +movu m0, [srcq] +movu m1, [shuffle_mask] +pshufb m0, m1 ; shuffle m0 based on m1 +``` + +Note that -1 for easy reading is used as the shuffle index to zero out the output byte: -1 as a byte is the 0b11111111 bitfield (two’s complement), and thus the MSB (0x80) is set. diff --git a/lesson_02/index.md b/lesson-two.md similarity index 63% rename from lesson_02/index.md rename to lesson-two.md index 4e06da0..a1dadad 100644 --- a/lesson_02/index.md +++ b/lesson-two.md @@ -1,21 +1,21 @@ -**FFmpeg Assembly Language Lesson Two** +# Lesson two Now that you’ve written your first assembly language function, we will now introduce branches and loops. -We need to first introduce the idea of labels and jumps. 
In the artificial example below, the jmp instruction moves the code instruction to after “.loop:”. “.loop:” is known as a _label_, with the dot prefixing the label meaning it’s a _local label_, effectively allowing you to reuse the same label name across multiple functions. This example, of course, shows an infinite loop, but we’ll extend this later to something more realistic. -```assembly +```wasm mov r0q, 3 .loop: dec r0q jmp .loop ``` -Before making a realistic loop we have to introduce the *FLAGS* register. We won’t dwell on the intricacies of *FLAGS* too much (again because GPR operations are largely scaffolding) but there are several flags such as Zero-Flag, Sign-Flag and Overflow-Flag which are set based on the output of most non-mov instructions on scalar data such as arithmetic operations and shifts. +Before making a realistic loop we have to introduce the _FLAGS_ register. We won’t dwell on the intricacies of _FLAGS_ too much (again because GPR operations are largely scaffolding) but there are several flags such as Zero-Flag, Sign-Flag and Overflow-Flag which are set based on the output of most non-mov instructions on scalar data such as arithmetic operations and shifts. Here’s an example where the loop counter counts down until zero and jg (jump if greater than zero) is the loop condition. dec r0q sets the FLAGs based on the value of r0q after the instruction and you can jump based on them. -```assembly +```wasm mov r0q, 3 .loop: ; do something @@ -34,7 +34,7 @@ do } while(i > 0) ``` -This C code is a bit unnatural. Usually a loop in C is written like this: +This C code is a bit unnatural. 
Usually a loop in C is written like this: ```c int i; @@ -43,7 +43,7 @@ for(i = 0; i < 3; i++) { } ``` -This is roughly equivalent to (there's no simple way of matching this ```for``` loop): +This is roughly equivalent to (there's no simple way of matching this `for` loop): ```assembly xor r0q, r0q @@ -54,35 +54,35 @@ xor r0q, r0q jl .loop ; jump if (r0q - 3) < 0, i.e (r0q < 3) ``` -There are several things to point out in this snippet. First is ```xor r0q, r0q``` which is a common way of setting a register to zero, that on some systems is faster than ```mov r0q, 0```, because, put simply, there is no actual load taking place. It can also be used on SIMD registers with ```pxor m0, m0``` to zero out an entire register. The next thing to note is the use of cmp. cmp effectively subtracts the second register from the first (without storing the value anywhere) and sets *FLAGS*, but as per the comment, it can be read together with the jump, (jl = jump if less than zero) to jump if ```r0q < 3```. +There are several things to point out in this snippet. First is `xor r0q, r0q` which is a common way of setting a register to zero, that on some systems is faster than `mov r0q, 0`, because, put simply, there is no actual load taking place. It can also be used on SIMD registers with `pxor m0, m0` to zero out an entire register. The next thing to note is the use of cmp. cmp effectively subtracts the second register from the first (without storing the value anywhere) and sets _FLAGS_, but as per the comment, it can be read together with the jump, (jl = jump if less than zero) to jump if `r0q < 3`. -Note how there is one extra instruction (cmp) in this snippet. Generally speaking, fewer instructions means faster code, which is why the earlier snippet is preferred. As you’ll see in future lessons, there are more tricks used to avoid this extra instruction and have *FLAGS* be set by arithmetic or another operation. 
Note how we are not writing assembly to match C loops exactly, we write loops to make them as fast as possible in assembly. +Note how there is one extra instruction (cmp) in this snippet. Generally speaking, fewer instructions means faster code, which is why the earlier snippet is preferred. As you’ll see in future lessons, there are more tricks used to avoid this extra instruction and have _FLAGS_ be set by arithmetic or another operation. Note how we are not writing assembly to match C loops exactly, we write loops to make them as fast as possible in assembly. -Here are some common jump mnemonics you’ll end up using (*FLAGS* are there for completeness, but you don’t need to know the specifics to write loops): +Here are some common jump mnemonics you’ll end up using (_FLAGS_ are there for completeness, but you don’t need to know the specifics to write loops): -| Mnemonic | Description | FLAGS | -| :---- | :---- | :---- | -| JE/JZ | Jump if Equal/Zero | ZF = 1 | -| JNE/JNZ | Jump if Not Equal/Not Zero | ZF = 0 | -| JG/JNLE | Jump if Greater/Not Less or Equal (signed) | ZF = 0 and SF = OF | -| JGE/JNL | Jump if Greater or Equal/Not Less (signed) | SF = OF | -| JL/JNGE | Jump if Less/Not Greater or Equal (signed) | SF ≠ OF | -| JLE/JNG | Jump if Less or Equal/Not Greater (signed) | ZF = 1 or SF ≠ OF | +| Mnemonic | Description | FLAGS | +| -------- | ------------------------------------------ | ------------------ | +| JE/JZ | Jump if Equal/Zero | ZF = 1 | +| JNE/JNZ | Jump if Not Equal/Not Zero | ZF = 0 | +| JG/JNLE | Jump if Greater/Not Less or Equal (signed) | ZF = 0 and SF = OF | +| JGE/JNL | Jump if Greater or Equal/Not Less (signed) | SF = OF | +| JL/JNGE | Jump if Less/Not Greater or Equal (signed) | SF ≠ OF | +| JLE/JNG | Jump if Less or Equal/Not Greater (signed) | ZF = 1 or SF ≠ OF | **Constants** Let’s look at some examples showing how to use constants: -```assembly +```assembly SECTION_RODATA constants_1: db 1,2,3,4 constants_2: times 2 dw 4,3,2,1 ``` -*
SECTION_RODATA specifies this is a read-only data section. (This is a macro because different output file formats that operating systems use declare this differently) -* constants_1: The label constants_1, is defined as ```db``` (declare byte) - i.e equivalent to uint8_t constants_1[4] = {1, 2, 3, 4}; -* constants_2: This uses the ```times 2``` macro to repeat the declared words - i.e equivalent to uint16_t constants_2[8] = {4, 3, 2, 1, 4, 3, 2, 1}; +- SECTION_RODATA specifies this is a read-only data section. (This is a macro because different output file formats that operating systems use declare this differently) +- constants_1: The label constants_1, is defined as `db` (declare byte) - i.e. equivalent to uint8_t constants_1\[4] = {1, 2, 3, 4}; +- constants_2: This uses the `times 2` macro to repeat the declared words - i.e. equivalent to uint16_t constants_2\[8] = {4, 3, 2, 1, 4, 3, 2, 1}; These labels, which the assembler converts to a memory address, can then be used in loads (but not stores as they are read-only). Some instructions take a memory address as an operand so they can be used without explicit loads into a register (there are pros and cons to this). @@ -104,14 +104,14 @@ The 4-byte offset between elements of data is precalculated by the C compiler. B Let’s look at the syntax for memory address calculations. This applies to all types of memory addresses: -```assembly +```assembly [base + scale*index + disp] ``` -* base - This is a GPR (usually a pointer from a C function argument) -* scale - This can be 1, 2, 4, 8. 1 is the default -* index - This is a GPR (usually a loop counter) -* disp - This is an integer (up to 32-bit). Displacement is an offset into the data +- base - This is a GPR (usually a pointer from a C function argument) +- scale - This can be 1, 2, 4, 8. 1 is the default +- index - This is a GPR (usually a loop counter) +- disp - This is an integer (up to 32-bit).
Displacement is an offset into the data x86asm provides the constant mmsize, which lets you know the size of the SIMD register you are working with. @@ -135,7 +135,7 @@ jg .loop RET ``` -Note how in ```movu m1, [srcq+2*r1q+3+mmsize]``` the assembler will precalculate the right displacement constant to use. In the next lesson we’ll show you a trick to avoid having to do add and dec in the loop, replacing them with a single add. +Note how in `movu m1, [srcq+2*r1q+3+mmsize]` the assembler will precalculate the right displacement constant to use. In the next lesson we’ll show you a trick to avoid having to do add and dec in the loop, replacing them with a single add. **LEA** @@ -151,7 +151,7 @@ Contrary to the name, LEA can be used for normal arithmetic as well as address c lea r0q, [r1q + 8*r2q + 5] ``` -Note that this does not affect the contents of r1q and r2q. It also doesn’t affect *FLAGS* (so you can’t jump based on the output). Using LEA avoids all these instructions and temporary registers (this code is not equivalent because add changes *FLAGS*): +Note that this does not affect the contents of r1q and r2q. It also doesn’t affect _FLAGS_ (so you can’t jump based on the output). Using LEA avoids all these instructions and temporary registers (this code is not equivalent because add changes _FLAGS_): ```assembly movq r0q, r1q @@ -164,5 +164,3 @@ add r0q, r3q You’ll see lea used a lot to set up addresses before loops or perform calculations like the above. Note of course, that you can’t do all types of multiply and addition, but multiplications by 1, 2, 4, 8 and addition of a fixed offset is common. In the assignment you’ll have to load a constant and add the values to a SIMD vector in a loop. 
- -[Next Lesson](../lesson_03/index.md) diff --git a/lesson_01/index.md b/lesson_01/index.md deleted file mode 100644 index 7b911a0..0000000 --- a/lesson_01/index.md +++ /dev/null @@ -1,213 +0,0 @@ -**FFmpeg Assembly Language Lesson One** - -**Introduction** - -Welcome to the FFmpeg School of Assembly Language. You have taken the first step on the most interesting, challenging, and rewarding journey in programming. These lessons will give you a grounding in the way assembly language is written in FFmpeg and open your eyes to what's actually going on in your computer.. - -**Required Knowledge** - -* Knowledge of C, in particular pointers. If you don't know C, work through [The C Programming Language](https://en.wikipedia.org/wiki/The_C_Programming_Language) book -* High School Mathematics (scalar vs vector, addition, multiplication etc) - -**What is assembly language?** - -Assembly language is a programming language where you write code that directly corresponds to the instructions a CPU processes. Human readable assembly language is, as the name suggests, *assembled* into binary data, known as *machine code*, that the CPU can understand. You might see assembly language code referred to as “assembly” or “asm” for short. - -The vast majority of assembly code in FFmpeg is what's known as *SIMD, Single Instruction Multiple Data*. SIMD is sometimes referred to as vector programming. This means that a particular instruction operates on multiple elements of data at the same time. Most programming languages operate on one data element at a time, known as scalar programming. - -As you might have guessed, SIMD lends itself well to processing images, video, and audio which have lots of data ordered sequentially in memory. There are specialist instructions available in the CPU to help us process sequential data. - -In FFmpeg, you'll see the terms “assembly function”, “SIMD”, and “vector(ise)” used interchangeably. 
They all refer to the same thing: Writing a function in assembly language by hand to process multiple elements of data in one go. Some projects may also refer to these as “assembly kernels”. - -All of this might sound complicated, but it's important to remember that in FFmpeg, high schoolers have written assembly code. As with everything, learning is 50% jargon and 50% actual learning. - -**Why do we write in assembly language?** -To make multimedia processing fast. It’s very common to get a 10x or more speed improvement from writing assembly code, which is especially important when wanting to play videos in real time without stuttering. It also saves energy and extends battery life. It’s worth pointing out that video encode and decode functions are some of the most heavily used functions on earth, both by end-users and by big companies in their datacentres. So even a small improvement adds up quickly. - -You’ll often see, online, people use *intrinsics,* which are C-like functions that map to assembly instructions to allow for faster development. In FFmpeg we don’t use intrinsics but instead write assembly code by hand. This is an area of controversy, but intrinsics are typically around 10-15% slower than hand-written assembly (intrinsics supporters would disagree), depending on the compiler. For FFmpeg, every bit of extra performance helps, which is why we write in assembly code directly. There’s also an argument that intrinsics are difficult to read owing to their use of “[Hungarian Notation](https://en.wikipedia.org/wiki/Hungarian_notation)”. - -You may also see *inline assembly* (i.e. not using intrinsics) remaining in a few places in FFmpeg for historical reasons, or in projects like the Linux Kernel because of very specific use cases there. This is where assembly code is not in a separate file, but written inline with C code. 
The prevailing opinion in projects like FFmpeg is that this code is hard to read, not widely supported by compilers and unmaintainable. - -Lastly, you’ll see a lot of self-proclaimed experts online saying none of this is necessary and the compiler can do all of this “vectorisation” for you. At least for the purpose of learning, ignore them: recent tests in e.g. [the dav1d project](https://www.videolan.org/projects/dav1d.html) showed around a 2x speedup from this automatic vectorisation, while the hand-written versions could reach 8x. - -**Flavours of assembly language** -These lessons will focus on x86 64-bit assembly language. This is also known as amd64, although it still works on Intel CPUs. There are other types of assembly for other CPUs like ARM and RISC-V and potentially in the future these lessons will be extended to cover those. - -There are two flavours of x86 assembly syntax that you’ll see online: AT&T and Intel. AT&T Syntax is older and harder to read compared to Intel syntax. So we will use Intel syntax. - -**Supporting materials** -You might be surprised to hear that books or online resources like Stack Overflow are not particularly helpful as references. This is in part because of our choice to use handwritten assembly with Intel syntax. But also because a lot of online resources are focused on operating system programming or hardware programming, usually using non-SIMD code. FFmpeg assembly is particularly focused on high performance image processing, and as you’ll see it’s a particularly unique approach to assembly programming. That said, it’s easy to understand other assembly use-cases once you’ve completed these lessons - -Many books go into a lot of computer architecture details before teaching assembly. This is fine if that’s what you want to learn, but from our standpoint, it’s like studying engines before learning to drive a car. 
- -That said, the diagrams in the later parts of “The Art of 64-bit assembly” book showing SIMD instructions and their behaviour in a visual form are helpful: [https://artofasm.randallhyde.com/](https://artofasm.randallhyde.com/) - -A discord server is available to answer questions: -[https://discord.com/invite/Ks5MhUhqfB](https://discord.com/invite/Ks5MhUhqfB) - -**Registers** -Registers are areas in the CPU where data can be processed. CPUs don’t operate on memory directly, but instead data is loaded into registers, processed, and written back to memory. In assembly language, generally, you cannot directly copy data from one memory location to another without first passing that data through a register. - -**General Purpose Registers** -The first type of register is what is known as a General Purpose Register (GPR). GPRs are referred to as general purpose because they can contain either data, in this case up to a 64-bit value, or a memory address (a pointer). A value in a GPR can be processed through operations like addition, multiplication, shifting, etc. - -In most assembly books, there are whole chapters dedicated to the subtleties of GPRs, the historical background etc. This is because GPRs are important when it comes to operating system programming, reverse engineering, etc. In the assembly code written in FFmpeg, GPRs are more like scaffolding and most of the time their complexities are not needed and abstracted away. - -**Vector registers** -Vector (SIMD) registers, as the name suggests, contain multiple data elements. 
There are various types of vector registers: - -* mm registers - MMX registers, 64-bit sized, historic and not used much any more -* xmm registers - XMM registers, 128-bit sized, widely available -* ymm registers - YMM registers, 256-bit sized, some complications when using these -* zmm registers - ZMM registers, 512-bit sized, limited availability - -Most calculations in video compression and decompression are integer-based so we’ll stick to that. Here’s an example of 16 bytes in an xmm register: - -| a | b | c | d | e | f | g | h | i | j | k | l | m | n | o | p | -| :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | - -But it could be eight words (16-bit integers) - -| a | b | c | d | e | f | g | h | -| :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | - -Or four double words (32-bit integers) - -| a | b | c | d | -| :---- | :---- | :---- | :---- | - -Or two quadwords (64-bit integers): - -| a | b | -| :---- | :---- | - -To recap: - - -* **b**ytes - 8-bit data -* **w**ords - 16-bit data -* **d**oublewords - 32-bit data -* **q**uadwords - 64-bit data -* **d**ouble **q**uadwords - 128-bit data - -The bold characters will be important later. - -**x86inc.asm include** -You’ll see in many examples we include the file x86inc.asm. X86inc.asm is a lightweight abstraction layer used in FFmpeg, x264, and dav1d to make an assembly programmer's life easier. It helps in many ways, but to begin with, one of the useful things it does is it labels GPRs, r0, r1, r2. This means you don’t have to remember any register names. As mentioned before, GPRs are generally just scaffolding so this makes life a lot easier. 
- -**A simple scalar asm snippet** - -Let’s look at a simple (and very much artificial) snippet of scalar asm (assembly code that operates on individual data items, one at a time, within each instruction) to see what’s going on: - -```assembly -mov r0q, 3 -inc r0q -dec r0q -imul r0q, 5 -``` - -In the first line, the *immediate value* 3 (a value stored directly in the assembly code itself as opposed to a value fetched from memory) is being stored into register r0 as a quadword. Note that in Intel syntax, the source operand (the value or location providing the data, located on the right) is transferred to the destination operand (the location receiving the data, located on the left), much like the behavior of memcpy. You can also read it as “r0q = 3”, since the order is the same. The “q” suffix of r0 designates the register as being used as a quadword. inc increments the value so that r0q contains 4, dec decrements the value back to 3. imul multiplies the value by 5. So at the end, r0q contains 15. - -Note that the human readable instructions such as mov and inc, which are assembled into machine code by the assembler, are known as *mnemonics*. You may see online and in books mnemonics represented with capital letters like MOV and INC but these are the same as the lower case versions. In FFmpeg, we use lower case mnemonics and keep upper case reserved for macros. - -**Understanding a basic vector function** - -Here’s our first SIMD function: - -```assembly -%include "x86inc.asm" - -SECTION .text - -;static void add_values(uint8_t *src, const uint8_t *src2) -INIT_XMM sse2 -cglobal add_values, 2, 2, 2, src, src2 - movu m0, [srcq] - movu m1, [src2q] - - paddb m0, m1 - - movu [srcq], m0 - - RET -``` - -Let’s go through it line by line: - -```assembly -%include "x86inc.asm" -``` - -This is a “header” developed in the x264, FFmpeg, and dav1d communities to provide helpers, predefined names and macros (such as cglobal below) to simplify writing assembly. 
- -```assembly -SECTION .text -``` - -This denotes the section where the code you want to execute is placed. This is in contrast to the .data section, where you can put constant data. - -```assembly -;static void add_values(uint8_t *src, const uint8_t *src2) -INIT_XMM sse2 -``` - -The first line is a comment (the semi-colon “;” in asm is like “//” in C) showing what the function argument looks like in C. The second line shows how we are initialising the function to use XMM registers, using the sse2 instruction set. This is because paddb is an sse2 instruction. We’ll cover sse2 in more detail in the next lesson. - -```assembly -cglobal add_values, 2, 2, 2, src, src2 -``` - -This is an important line as it defines a C function called “add_values”. - -Let’s go through each item one at a time: - -* The next parameter shows it has two function arguments. -* The parameter after that shows that we’ll use two GPRs for the arguments. In some cases we might want to use more GPRs so we have to tell x86util we need more. -* The parameter after that tells x86util how many XMM registers we are going to use. -* The following two parameters are labels for the function arguments. - -It’s worth noting that older code may not have labels for the function arguments but instead address GPRs directly using r0, r1 etc. - -```assembly - movu m0, [srcq] - movu m1, [src2q] -``` - -movu is shorthand for movdqu (move double quad unaligned). Alignment will be covered in another lesson but for now movu can be treated as a 128-bit move from [srcq]. In the case of mov, the brackets mean that the address in [srcq] is being dereferenced, the equivalent of **src in C.* This is what’s known as a load. Note that the “q” suffix refers to the size of the pointer *(*i.e in C it represents *sizeof(*src) == 8 on 64-bit systems, and x86asm is smart enough to use 32-bit on 32-bit systems) but the underlying load is 128-bit. 
- -Note that we don’t refer to vector registers by their full name, in this case xmm0,but as m0, an abstracted form. In future lessons you’ll see how this means you can write code once and have it work on multiple SIMD register sizes. - -```assembly -paddb m0, m1 -``` - -paddb (read this in your head as *p-add-b*) is adding each byte in each register as shown below. The “p” prefix stands for “packed” and is used to identify vector instructions vs scalar instructions. The “b” suffix shows that this is bytewise addition (addition of bytes). - -| a | b | c | d | e | f | g | h | i | j | k | l | m | n | o | p | -| :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | - -\+ - -| q | r | s | t | u | v | w | x | y | z | aa | ab | ac | ad | ae | af | -| :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | - -\= - -| a+q | b+r | c+s | d+t | e+u | f+v | g+w | h+x | i+y | j+z | k+aa | l+ab | m+ac | n+ad | o+ae | p+af | -| :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | :---- | - -```assembly -movu [srcq], m0 -``` - -This is what’s known as a store. The data is written back to the address in the srcq pointer. - -```assembly -RET -``` - -This is a macro to denote the function returns. Virtually all assembly functions in FFmpeg modify the data in the arguments as opposed to returning a value. - -As you’ll see in the assignment, we create function pointers to assembly functions and use them where available. 
- -[Next Lesson](../lesson_02/index.md) diff --git a/lesson_03/image1.png b/lesson_03/image1.png deleted file mode 100644 index 9da3b085097fb5e154bc8f34b686b79bfaa4b737..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11246 zcma)ibyQs6(j^YTNpJ{Gum*x#a1ZVl+}$O3aM$4O4uQrA?(QzZ-QD?ae($}RZ`S(O z%>2=P`>xgJo^z_st=hF~hsw)}qaflVLO?*ENJ@w(LO?){0>5h!pn&i1Uh8+j1=3Mb zTnM6ajPL-sfiV@75rlxKjz)UchXwB6*-26~Ps zsVn!szD0-RmyqNa+grI|fM|Av}H^NNrXRU3eqwp>3!I+!-mSP`EtqT5#c zFpy2~up|DZceF|`#OFRiFV(;=AB2EO`??byBU0Qxo}QwexN5w}Vw$AAe&v2UnZni5 zo$hF|c9k+dlESjS*6IP}M}lC$Znf}+;71bjT_8gg9fpaq7!2I)!a8FMqWApH9wS5W zOU9YefDuH$^5YH%{;wl%iUD)Ls7x`InW5fE#*1(W`a4?iKclr{3D`(z)p(?#LPNpu zA;oYc?jHsb0t<_4NQD!NspHIIW0CwwGJJ!|{OT1@s-UamSqSkxus10Y;<1G4so+|4cbBH8O%H`lq#Z!o@NYzUKRBE4^h8bTJp;&PY<+6@KpXzxINLii36U zhUw%Wozq1I_N4IBIPg&Qh^Qh26IOM$Fy{pkF)+HgX**o-m064RuzWFO1cQek3jY^^ z!H(}J?ULl+VpDbnpWy4OAYXP<)8Zfrc$34Z)@()tI))BanWB#>+&*^O?GQv#()NM3 zhc!dwu0v1v))Gdr94$JutyHyaF;O%pa)^PFDLS-ZF?6ROcVx(FvKXwrk@fRK3ZX#I zyWzlfi9`euIHY08Qt=46sAjckNvG&nP>%95o9~qlf`M_PIPvI(eiX!Ly>tM36Dwy7 z>t@ILR5I=_tAo3pTC_zrr17`&$Hfoh**>BZSLj>jpE<+$z6kX(#wNEWL2C>$Zhv~>x%RGDkPc2;9q68U6PP|DR{^(SK~}u4r%pM%7a{?J!b6 zpdV=w0Uy3uga;mObRZcSfjhG?XPC2c{m?0S-P!fp zuZcZ@$!K?>=&Ywzx~bdo2S?me`DRVo66ZHquBQp8=Dv%9T=)cDO5AJuldFAT?HveX ztxUZ1?6gWWuas>Yl~!EY6p|jR6P=A~llNHs3L^pAIo?(Ikk^^ANqfa;FTE3PG@16W z3Xlz)0(!3g$17IlGnUym^O21|n@7CbON0jbT7(18+eMFkSp|%e-encnxMwlOpRbn2 z^^6-!63*m{?!Z`~>N6c+dj9+%55mqdT)G@ zg&>dKy7|28ze&k^pNLLOSGW9&JtI&ouZBv(k-}O``fC6JZ~3PQGoQ7$NlK4RCo@4yD38n+a*8LARUkU$cAeadq!(y) z-)lw=tn`e0=3r9RQ1~+K<%jz-fEzxk9Y*?tlAAgvV^upO;-!+XYLx>nntU$55X`=xu%Y!qwhj=Fd3G5&J7%&QJ zm@Swfa6%Iy84@0`xPSkervj^wD=O#=36T+fh*b0pYblC86t#hPR(^SJumx-o&a_4? 
z`6o@p8Pm1m`5^a3#r(sPv&`(`!CpggU(rh5>kJkM9#8mnp$e*=*?2?rgaZPJmtR^| zhwaZUrkq298a|$~YKAE(28ZiluC22-w85xv(D#m*WL4KGIC94sl`L2l?DUO1%*&7Q zvPbB^GaS-+toO*%mWm?xP;oA($Ez*!l5_ z$1UHM8BdB2E=&2?-%V4+$-P~5Pw{0-PTb<>E3yptXj zpP0Yk6-)?E7xKzIF898ob7-&k@fob=8TU5Rg^)TJ6o^kQQs;!v_w}H>xiHW+80JW|TvZXOk7YsG96ANW8^4Womt9fRH+Tl^JFD^>%AGi zXfgryZ~+3|eIU{v9p~@V*@XVU2;&*dQIa{P{*mZUHu`1Vayy>G3GleR(+*H!6y9OnkYhPkTr}!#D!?->&F5M1E~a1aVz5QRU8o<*(QDk)K*_#M zRW=MhDyotLH<$s)DxZE!?d)@NStL7Fes+8}{O6?3CPy8)^2zUs%%yAT_7B+V6A1NA@Qfh)ZXc{wK%1V20fE4`p!1V+O`0Cv!3G3vp^ zW`+XNL%BNH#-KZl;3K^-p*T1ORSH|$m=tTDLjXt@((61APwbD?60X-S9`z^FHymm; za?`JtAS^+Ukk->GV|y82$?wk1kXgHe4Hrpg-tcSAR&CQ0oyG0iaBm^&n!DP$YA2Wj zbHuNQI*YenvkANQd8ek5A+z$=S9OMaAd`?A_cyE6jZY@yg$>mdFb5hP@|sr{XisuF zo@YDe_QRe;_)GyG1x^VfPE4 zGv2$+FY_|`EJQ@S>AkSTZ^aO$$F=yv+JBc3}KKk2atzp%P!An zP0l4$S?Bl(wI*}r;zUO9=tt0Sgf@mjXdkVrRnl6&nNp!>aXmJ;inez$blQB95=Cd3 zn)AL3@5afk$BDwVNUH7QSii78hA^l|FDbpVw>QF&@;z!s7lGoUvbZB`f|xrBzanxW zH0f8wI5P2Cl5EtCr%G0|c$#fyfH@$QrL`5YX5@BxUGiX*(|U>E2(z+<>?2M{vi%?a)`4)8#y>97DC6hWh}IMM5EI zOrk30GRe5H-1e6Da(X+T>1O`go%qp#%)$3{YI95jf)3r!3GmviTKz}*3iErlo-K@U zUi+G|24P~kUv6@tz!L%->mqIQ(+3{gOitoL!(wQ*#88dlW7Z;!i5pb>xx=qrD_K{f z#yP<|!&7^ixNrJBTfjz=IF*dQ?}31QUh4uLQM^s6dbo%G{N1G@h0WC=iHC!{H8ye1 zWnZL1|0B^MXo@(8pcmK%Zh+GK*t^sls`TSS4|SI_T$x`sq%Tp=n^I1B3uwKA3`jPN ze#r?+2^^S>z8Rp#uBL8`mZplzC|QbgS--py z!`?H@t$-!xRP%3q(USIa_LrFIGG zANb7g+OO|YNH6vXPiOAyKf&QT3E}$b(h=fO@sqpb!hP>SJOAAPboZIz#c7OY#Juav z`Q#8kI0Vu8ljbb)iAH{v!u6HPjaap%|EEnb_$7v*!8_O|5=X=FuU9|ri&c9>DT^}P z_Cy*yv0&k(tz?{N^dJJo$|IB-zKx{~CQtVkqzlAB($5)I>H4SyFy@R#zN;`tm>2mD zj16XZv4W1(wO#$WPDrYH%9tv}7!i8HFd2eDNd0X~qXH;*xR-!lwi_O;nqXj4)fe{A z*k%Tz@3;psv2S1%e=9zId)xphVajhjS{J9z6z*@dMl`qfQk(FTs$5W!Q)8=Prw2vy z2)UDL=$xc-+w8>la^9SpImKUase|lcX&xUAhfYMT@)kUZxl!83_p=U#H+AQ`bIk6l zZ}GGCpR;xNjcL4YNnioRSi^xj#6 zf%?6tmp}BXM(V4TW8N2L{{&Wz6nXsdX z3UX)@KQa%lmU8nL`=NvDJOZcFB$b~Dv)D~C+6H?qG;Yb-sSKQLQHq*bi>4Ubn=cZAPVyrkoP7wd zO@y_Qm&fKb*sml^vlUN{HHm)49-Maj( 
zlJEjriWhAW)cy1-DL%LZ1=NUti@9u4WRp!I++(^zC;zqSSf_+*PcDwMRgVr{O$?3X zCb{T|rG=$l)AYq#=agNsYCBUTa({J~SHV~zb1rdLi?6iP?-hlyuikxI`*D4~>d-T3 zq+BTKgB8;_lpxBbj`=o9i^HT`@r7@-TL$^7P=^2Vv#Dtqjabf3dPa+=jt?~9QW6F} zj#yRTQR!?>zvE>=O6}?)h?T$EAO3<|722G5q7=z+DCD77`dkJTVmSa79Ey(5R~j@> zS`~Cty21SU6TDfndqROy{cH8R0!Zb&B&4hokwI30CTi&y$}VM9ozV5n3FeNBST7fm zDprrm%>3NnKLbU+e6pZx8|@GjCdS=cKdxv|C4ncnQKc2`2G8rk1ok)^VRZh5h!Gn(qkEub@N%BRb4q$Ro*Wy2FS3Xyb>@ z72ptH2CIY#SheB$wguYq=nyp*{StdIA3sqKA|&KpIHOb-<8j$p=hM;Cq(l#SZ;5N8 z?)13Xj8lk$6#7LQh=9;?JEntfbAMBw8ENnmKOloH`rhdMX(L8rD|>$Yk5sq}E~jiy zUMgW$6T@TxMn}i1PLsYBl*e2fPX$iV9}_$}Zng~|>^OW-JBjczowCpvlCTRMD5B)r zuNIB-S3yF5aXTw_+iI@N^0BTB!l5?P5Aea37Qf8WoD;ZM;FK{OD8%OL7MH}0_-w#x zhO76H0a~bm3qrFQW#1@`Ks1*EbxWBNx4tG-PVF2;Ip?0%pOYjdU^_JbM^3%Muh&5f z<-~797(nclNgC3!nOSS`2rje~B*Z}8S8{&(bE@b4OyM(SKUhYqH}Z=6%9_PgHkw;x z+=L0MFQ^w{S082wReJbLL+T96CORGJJi>I97b+$4Z6I3r<8CSF|6Rm}UJ;4$R2 zM%n<5BNi$-ugH|kia>sc8@`}RLG|X&&F_8kSMVEi-(%{o0k)9DWptj8tZ+ASHOb6 z{fOP?_6zyi^oTf(B&I3^ophsiZl5?`wa8)=P3D8bLpe$|BmCh0ph)+Vl{6CsZwk7? zbdw9*8uOqE7|LZ4T&E*SX698;-f|- zWi3}ARmGF^xLeOF&Vc#i>AaZ!P7wUsqbOQ_@~zy?gSO# zBt(z8YAnp#f(NuMEes4k0CunjJI+z4tD44oqX20-8GX64`%C)A+H|lg)FH?K(k@TE zXL|t!ukxX>LPh`J`)4{~-pL=y?+CP7o-)#{;n7;XuBOgLIW%bf2KqV|TOKq=GwPQ? zHYUiwgE4-Ue*gSX>($)$z>1Ux(IU7{$cnRF5h81N((k@zZfSjAhb)TCL@M8sjdaEo zn}GO;!d-0;s^C#VGKH(>nDtVOu|>6r{L8%A?bW5*cq4O#mm+5k$Eq;C^JZD@Otj2WkM#$H&#Qd;aWR&-{jgdoTOi z7U>O|`sme}S;4bX>+m9T|Jz6X?dmKTAjWt9HzBxCla z^(x5hhg~UphAVnI8&%?yt&)CE9zqFmF!+){M@>I+m**OD@O$2KZCp ziAV!MU%m*pF9y}7wmvS{OM8+#k+siZH|A8GRu7=|+?_j@H@vY4A73u;Bc5%$xgjx7 z4|)+)vr?+u#5>>K2BkGJ+-%>+3g0X>Dbi4vI1<`$9AU_M9Uyu8SBIaP?9~bPe2?v3 zzN*+a?t1wVclfxya;YzDmGsnfX8}I_+HGxCIiUj7ZB@3fUSVbNN&D#gWn(*h&?}8M zB}IJNJZZh6B=+5VMOl~M_kkA!Oo7*-jxqx5`t*7|m!ch#4=u>f0jSQ@J?rMZ!_|*F zSI4yyfjVf<+ZseUHW!rR5C5aouU^*Px#z~T6+wFsq@shn)&0w#a*(`bAE%fCun~v> zpzV`!pb2_4x%85KClN0-HI>j1k#^sil|>IDoL!LH;iTydC6-Se*vMK)2A=wK@Ac|? 
zpBBE?C}yE+z-hg^RIXUVamHrC0gAkdhPS;1AO=T!c#j3>Yy`s|!BNzGb3&NWu}wKX z+}}Bd5ToOUd4%gsi^{Le5H2&2bbf)?O)N)AW$j+q)RfFWW(@Wr$M%>i#-GvX0d1!j z!?FNHgv4j;@|8#EJh2^gvw-3uGv|)T&PQCkyaq3Av3bo5<+X)GuiMc*swLwF?Mzt26;;R%&iy@8NkxVYNdmR5ll zDY~9C^zA~HrC<59@xTy3jA8E3@hzS6Z24%V{voPY8fpO_uxz&2QIqE7oHsF!_?bOw zy!OYZwJVGeolTZ8kf)QAl;u+#IKOitF5p5b3w%K#|?8X`QugcypS9+I+Bjt6-8uj8yW}QzF1p( z^g4HptOnv}efX?QpJexKfz#5hyevPp9EBP~4S^H21?^b1xduy3_r_80SHXSxJ>=69 zd+_+@ho2zrhVz3ZF6~*L8I6E=psI1Iu{_PIX=(sck?yCnNWN2 zFu>MaWJZMQqwRBrb-$%ta-OP{1f$OVKCT%8s1o`R)a7r?5mxX7E++1Wa&)cW>vyxJadL^w7(j7>BH!> zF6=@11lhTLrqA1-XHJ71=cf#_n1gjKNBjAIbB@oRD@6fTtNv!*h{ApxINXr<)P#pi zmJ$Y(vmQoQf-7}`q=fk^g?#tM!{kGmsz^s#NX zJEULZHuy$qhETD&WIIhq`-Yf~gkYX3RH!)68?Co2lpEc1~#* z6^=o6rqqcdD=bFo1DB5szX-n*Kv6)Ee}MM8A?o(L=X3WGvo?O3dO z{5&7Dolm22I0EJ7GsFWg7CW;JBQ)OYeFfgow>;oJ7N)tZ;BjmA{i&TxNFqdQHpt3nApVqLggvIEAwvAc(lUjcaxQa z8X3~fL*&m72i0=bjaD6Ya94}xiA?T-87GXsUd?+0fqrq^6~G~T1!+bb{^;5S3}yit z8##VIIOd|a!b}W}?R%=#V{9<;uR%`&1po<-&Fupm1-_`8^$^k?11z=rzYDy9#_k3P zwf;!_r3zo0VTQD0<}T$kWDb+J?n;4-lPzD!Ck^j{tAVGY7A|eu#}{dpm%wJN87zT2 ztKROpiVEx%~_p8V&iUW}QzWawinazy`%aa*T zW5QTmy3202NqqoPBYWZ9@U}%v6Ey9RH^L}`J)}n}$$p)q`YrBES(DubqVUFh-NDwt zg|gU%n+zrlzK&o)ah_WO>ioCO^T0+qAJ5xWf1cY+VlIR8%Rrm7?jqC5tRjSzOCXus zh0;yYNi6DS>UxHXis%v6^WDtUa4~3G+e#1<>tUE70ohgi!Scz~v3d)EcHx}oXnmJ;0X_&ns2Ntj_@{igp4H``_s+%DJS_v;k9?C>wln{8JRkveo9e{s zh)<5s|L%9ijV7r%Qk3(X!c&*u;T%EjGd-8Vr!O&2K4$g{;=(nuoc4S|!-q5cn`tD& zyVt3%7r$u(c;n2RE%prS=Red4LEzeo=vYDS?{8< z-sY8jwDyDO0K*S8Es5Q$RajCbaIkyoJnz;Fu8x_>*gFR>PM=HeIHQZ>7b5kUFGvsu zug*^`p+bH(1_BoP?w;&)q|qEX8urhb5>Z{0?RSM2T_uL<--s7%WFR60+4FIvT8}7a zZ*;xYQ24*S_bIM?b$MF#&>6e6nz%D9~f=_K}6(R42O4v?@>>c^ovYWZr= znp{V*3arT8b=Hffe>~f!vA=0Et)Bc)N)Ry>To<4|fM!F7O9>L*RHJ~dxxcQhJ~VaI zV=XyoI6=IBt>OCMl*XlfYq-D)^vjV#K-zZ=zgtc*9(gaC7XE@I=s$yUWAyDy3y%Tw z#iTCKZ&(j*z(L0q=v?)~Z`$1=cJTRK?6#pG8uBWRhLP`Tf4ovh^*eNI7f096ei9ffGDl)_%Ee9C(QPs|$&q3Hl$z5Irf6g)Z4fy*#cBD^bdP_SL>z$?7M5F0t9j%OiFm?4qxkm?Lt+{pPX7 
zF&`ZC!(d7Vhtj$MrsK2I^1N}&+=%AoYmOJeOT>yVoO2>E`XWH%xAyatd-tVLhEDWX z?;q8W9`bs_tY(S2!oE96iJ!X{az#Nzdf5ipc=ioAJOl$2-`_%xcHZl+{zZ(ImEM7g zP-0>Y@d}Hg?cXe|V9BpfaEM;khu_;s-2!BPX(g+0*$N>a`FEd(gvYsd{-e?AZ}7FV z!;3`tAY_h6&k;!H#1H$qvsG&_3CjFYur74hv)d(*Kz8bWVRyPjDvE+^-Cd@}7IcRN z3J3k4aL!s6U6=@F^_Xw{=`C!a4#Mx`J3S;?1n9;#v9JQBe_IDWwQUcsW|r4pQH1hW zgsJpK#`%hQW0w#(?p}v9#T?oBdd)(-BwgHNkhRVLdX3eTocHVSh4p@4y)b$*f)~LN zDW8eOu%1O|IOJqN_p+Hz80JE3S#=T0pa&mOxNhy?iWWJ({QcP12yxR0Q&==pNLG?| zI#VgBbB#CZOxeN(u55T+ziOQiO9B?(DTx=z&_P#F=8Yc1f zbDjGoVqQ#l=$MX&KI6>BSrJERPl_!zCX++CQMKbMHjE;>!AlU?ca-$$q50N1gA%{x zn>@w+BOLtKo3yA2s(qeW$&BJ)gHq|~MFRJ#A>j_?mU8SLg)m^V^33-;!5REK3VLK@pyX}gh-~xnw z+My3A8`- z!t)}Cx5fcRgOhAWfR5v_%(q8{S?5+6%))MZF!vcR!ARkmqwHIceBI@}6B-8-bwX2U zivD#{0#+}}Di=Gq0#3K}E10Dh6LqH7?m?=9l_@Z|Nz2H&qa)Ytdn==znXZ`GH{1%m z$}-hpqVd6`wL8r1qM&jU*Lg}1L;ihP_f){#lG7aTXZ4W9RHWUuuw9ta<)bruM#eS6)g?~O9f=Z-vYivB~Q zl8ixm{6}UWxB`6}A+L(e1>HS^RgTLHcdWoQ{;9gYN7PvJSPfE3LwFO=9r#X)nZRU? zx!j}Z)y5KkDiTNtz9YJG+5!cIrP?}%Q8f*(`@q;vY3h*vWv;aV@58{j#zDM z^h=flM)K@niHjd$Wd)f2rq$kB>1xX{CeisBA&&wEI>$8Lf1$* zr@~|SC#5!3Z61NmT;W2;zD6W+{e7$)L6|Bll`Z{>Vnl5X3C2$gu#)hdNHL;&_T=B1~~Zam{m zLYN)e0#W!6g&ANo@qYa0je&!)mnT*ZiMIS+xCe zAi2aXp&hpIm2$m$26`r_VyTJTtB|MPZSmjQgm;?%(kARB_LBwQ8Y9GiGj4m=PQtHp z?y&cZhHaP1H8`$>YtYao@?o$Xotd$pMUNkIb)$QujRe-#_1?~nQ9>~NMacMD;E=|M zcxIQOX7Ayr1aRZs=4OMF`y)2{9c6tI1pnaPHaowbNgpb^5XSQG)WcL|CEEcgqlgO3d_OqH;gjD!MWH zcn`tdw&td16%CvKWd%VAwS)lMMdFeIznC1_cV|!{&5tpt%=dH6lLf&}F`Z2lqs}TO zkNNkDo3raSj12k@Ev<9(m{o9U9(cY_&gOo=XDaD7u8pR4+|!{E~Zb$!U@+h`G1|-O}BhM*&dkDZ+Ydm^D!`V$++*GylW@0jt93{ zKNjleMy}n`AX)C%!B6!wG^f#ovsf_yQAPmrrC`^D`i?)eXKA zQEi+{b?7@?m61i)reXDQcUj*w3(lUb7ksl@!s2yfeQIOC=R#kqsfydLlyBKO7%d{* zI6gU^?3O@$$Q}S%pgQI9rq=%03zPEN=F7d!f7Y~p9CsemT4a%Ms@8H2RwrZqLso~{ z!zLwNtfF-h(rt8_bY{K*QcxDdo7cq>i*6thk(2n%wo``oB`qvbn`cK9{kg zW#mJ>WtxjROy^*&g0nwJ7Dwo;ssHTZQ~7e{M>bA_$yM##KYBZlg)j3Y=Ib*wLbvTU zmC{xf&^Z(gY7VTT@((Z(*{62#$)uk6U~#k$rhVuzf+MheuK#El%h&jU>(uc7(w+W) ms8$6>kp8<^mCEn@4HDEBr75#$`vhooAtXg*MJk2#{Qe8# byte), 
all the bits in the destination byte are set to 1 if the byte is negative, otherwise the bits in the destination byte are set to 0. punpckX can be used as above to perform the sign extension. If the byte is negative the corresponding byte is 0b11111111 and otherwise it’s 0x00000000. Interleaving the byte value with the output of pcmpgtb performs a sign extension to word as a result. - -```assembly -pxor m2, m2 ; zero out m2 - -movu m0, [srcq] -movu m1, m0 ; make a copy of m0 in m1 - -pcmpgtb m2, m0 -punpcklbw m0, m2 -punpckhbw m1, m2 -``` - -As you can see there is an extra instruction compared to the unsigned case. - -**Packing** - -packuswb (pack unsigned word to byte) and packsswb lets you go from word to byte. It lets you interleave two SIMD registers containing words into one SIMD register with a byte. Note that if the values exceed the byte range, they will be saturated (i.e clamped at the largest value). - -**Shuffles** - -Shuffles, also known as permutes, are arguably the most important instruction in video processing and pshufb (packed shuffle bytes), available in SSSE3, is the most important variant. - -For each byte the corresponding source byte is used as an index of the destination register, except when the MSB is set the destination byte is zeroed. It’s analogous to the following C code (although in SIMD all 16 loop iterations happen in parallel): - -```c -for(int i = 0; i < 16; i++) { - if(src[i] & 0x80) - dst[i] = 0; - else - dst[i] = dst[src[i]] -} -``` -Here’s a simple assembly example: - -```assembly -SECTION_DATA 64 - -shuffle_mask: db 4, 3, 1, 2, -1, 2, 3, 7, 5, 4, 3, 8, 12, 13, 15, -1 - -section .text - -movu m0, [srcq] -movu m1, [shuffle_mask] -pshufb m0, m1 ; shuffle m0 based on m1 -``` - -Note that -1 for easy reading is used as the shuffle index to zero out the output byte: -1 as a byte is the 0b11111111 bitfield (two’s complement), and thus the MSB (0x80) is set. - -[image1]: \ No newline at end of file