[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"sanity-XxsKA6fgNvqalCOPYXKPenEjRKXhhCems2OKis1g0P4":3,"sanity-htkWc6AII2EkGNiZpjl-xYdGsetN8f7pNYATNyn7yjA":1237},{"data":4,"sourceMap":-1},{"latestPodcast":5,"latestReleases":14,"post":39,"recent":1212},[6],{"_id":7,"publishedAt":8,"slug":9,"sponsored":12,"title":13},"f83eb5f0-1237-487f-84d8-f7abf2318c39","2026-06-25T07:40:00.000Z",{"_type":10,"current":11},"slug","code-isnt-causing-your-production-failures",null,"Code isn’t the only thing causing your production failures",[15,21,27,33],{"_id":16,"publishedAt":17,"slug":18,"title":20},"eb5b66eb-9410-4329-83bb-22bbff39402a","2026-04-28T13:00:00.000Z",{"_type":10,"current":19},"turn-scattered-knowledge-into-trusted-intelligence","Turning scattered knowledge into trusted intelligence: Stack Internal 2026.3",{"_id":22,"publishedAt":23,"slug":24,"title":26},"369c2401-b62e-4a37-8ff8-bf603023ecad","2026-03-02T15:03:00.988Z",{"_type":10,"current":25},"what-s-new-at-stack-overflow-march-2026","What’s new at Stack Overflow: March 2026",{"_id":28,"publishedAt":29,"slug":30,"title":32},"5e9053a4-07ea-447c-91ea-29e0b6228537","2026-02-02T15:00:00.000Z",{"_type":10,"current":31},"what-s-new-at-stack-overflow-february-2026","What’s new at Stack Overflow: February 2026",{"_id":34,"publishedAt":35,"slug":36,"title":38},"a1b538eb-a8a6-46d0-80a1-ac70ec9bb935","2026-01-05T10:00:00.000-05:00",{"_type":10,"current":37},"what-s-new-at-stack-overflow-january-2026","What’s new at Stack Overflow: January 
2026",{"_createdAt":40,"_id":41,"_rev":42,"_type":43,"_updatedAt":44,"author":45,"body":62,"comments":1181,"dateUrl":1182,"excerpt":1183,"image":1184,"legacyBody":1187,"product":12,"publishedAt":1190,"slug":1191,"sponsored":12,"tags":1193,"title":1211,"visible":1181},"2023-05-25T09:39:15Z","wp-post-16227","dgl3SCUzppW3U2LvCoSnq8","blogPost","2023-07-13T14:55:40Z",[46],{"_createdAt":47,"_id":48,"_rev":49,"_type":50,"_updatedAt":51,"avatar":52,"bio":57,"employee":58,"name":59,"slug":60},"2023-05-23T16:27:18Z","wp-author-cap-15407","07ZbrKPSUrjrV4wQ6fDpaa","blogAuthor","2023-06-20T15:05:10Z",{"_type":53,"asset":54},"image",{"_ref":55,"_type":56},"image-4f1f21247201b2d06b15970e66f6733c712a9ce2-1200x799-jpg","reference","I've been working in the software industry since 2000. For the last 8 years I live in Montenegro, working remotely as an independent contractor. Current areas of work include CAD/CAM/CAE, embedded Linux, and GPGPU.","none","Konstantin",{"current":61},"konstantin",[63,74,83,91,99,178,186,226,256,265,289,292,318,341,357,389,397,405,424,454,462,503,508,516,524,548,556,564,572,601,608,616,620,627,635,651,659,675,683,691,710,734,737,748,751,781,799,807,815,823,831,839,847,855,863,882,890,898,917,947,966,985,1015,1019,1038,1046,1054,1062,1070,1099,1107,1115,1118,1173],{"_key":64,"_type":65,"children":66,"markDefs":72,"style":73},"c2cbea63f87d","block",[67],{"_key":68,"_type":69,"marks":70,"text":71},"c2cbea63f87d0","span",[],"When done right, supplementing C or C++ code with vector intrinsics is exceptionally good for performance. 
For the cases presented in this blog post, vectorization improved performance by a factor of 3 to 12.",[],"normal",{"_key":75,"_type":65,"children":76,"markDefs":81,"style":82},"c597a654895c",[77],{"_key":78,"_type":69,"marks":79,"text":80},"c597a654895c0",[],"Introduction",[],"h1",{"_key":84,"_type":65,"children":85,"markDefs":90,"style":73},"cbac2f456d42",[86],{"_key":87,"_type":69,"marks":88,"text":89},"cbac2f456d420",[],"Many developers write software that’s performance sensitive. After all, that’s one of the major reasons why we still pick C or C++ language these days.",[],{"_key":92,"_type":65,"children":93,"markDefs":98,"style":73},"5458a4e281c2",[94],{"_key":95,"_type":69,"marks":96,"text":97},"5458a4e281c20",[],"All modern processors are actually vector under the hood. Unlike scalar processors, which process data individually, modern vector processors process one-dimensional arrays of data. If you want to maximize performance, you need to write code tailored to these vectors.",[],{"_key":100,"_type":65,"children":101,"markDefs":172,"style":73},"d7eceac75394",[102,106,111,116,120,124,128,132,137,141,145,149,152,156,159,163,168],{"_key":103,"_type":69,"marks":104,"text":105},"d7eceac753940",[],"Every time you write ",{"_key":107,"_type":69,"marks":108,"text":110},"d7eceac753941",[109],"code","float s = ",{"_key":112,"_type":69,"marks":113,"text":115},"d7eceac753942",[109,114],"em","a",{"_key":117,"_type":69,"marks":118,"text":119},"d7eceac753943",[109]," + ",{"_key":121,"_type":69,"marks":122,"text":123},"d7eceac753944",[109,114],"b",{"_key":125,"_type":69,"marks":126,"text":127},"d7eceac753945",[109],";",{"_key":129,"_type":69,"marks":130,"text":131},"d7eceac753946",[]," you’re leaving a lot of performance on the table. 
The processor could have added four float numbers to another four numbers, or even eight numbers to another eight numbers if that processor supports ",{"_key":133,"_type":69,"marks":134,"text":136},"d7eceac753947",[135],"c3af6f87d369","AVX",{"_key":138,"_type":69,"marks":139,"text":140},"d7eceac753948",[],". Similarly, when you write ",{"_key":142,"_type":69,"marks":143,"text":144},"d7eceac753949",[109],"int i = ",{"_key":146,"_type":69,"marks":147,"text":148},"d7eceac7539410",[109,114],"j",{"_key":150,"_type":69,"marks":151,"text":119},"d7eceac7539411",[109],{"_key":153,"_type":69,"marks":154,"text":155},"d7eceac7539412",[109,114],"k",{"_key":157,"_type":69,"marks":158,"text":127},"d7eceac7539413",[109],{"_key":160,"_type":69,"marks":161,"text":162},"d7eceac7539414",[]," to add 2 integer numbers, you could have added four or eight numbers instead, with corresponding ",{"_key":164,"_type":69,"marks":165,"text":167},"d7eceac7539415",[166],"311575a685d8","SSE2",{"_key":169,"_type":69,"marks":170,"text":171},"d7eceac7539416",[]," or AVX2 instructions.",[173,176],{"_key":135,"_type":174,"href":175,"reference":12},"link","https://en.wikipedia.org/wiki/Advanced_Vector_Extensions",{"_key":166,"_type":174,"href":177,"reference":12},"https://en.wikipedia.org/wiki/SSE2",{"_key":179,"_type":65,"children":180,"markDefs":185,"style":73},"1ecabb727882",[181],{"_key":182,"_type":69,"marks":183,"text":184},"1ecabb7278820",[],"Language designers, compiler developers, and other smart people have been trying for many years to compile scalar code into vector instructions in a way that would leverage the performance potential. 
So far, none of them have completely succeeded, and I’m not convinced it’s possible.",[],{"_key":187,"_type":65,"children":188,"markDefs":220,"style":73},"9c5b4cbe70d6",[189,193,198,202,207,211,216],{"_key":190,"_type":69,"marks":191,"text":192},"9c5b4cbe70d60",[],"One approach to leveraging vector hardware is SIMD intrinsics, available in all modern C or C++ compilers. SIMD ",{"_key":194,"_type":69,"marks":195,"text":197},"9c5b4cbe70d61",[196],"d33ce85c3eac","stands for",{"_key":199,"_type":69,"marks":200,"text":201},"9c5b4cbe70d62",[]," “single instruction, multiple data”. ",{"_key":203,"_type":69,"marks":204,"text":206},"9c5b4cbe70d63",[205],"1209dd536830","SIMD",{"_key":208,"_type":69,"marks":209,"text":210},"9c5b4cbe70d64",[]," instructions are available on many platforms; there’s a high chance your smartphone has them too, through the architecture extension ",{"_key":212,"_type":69,"marks":213,"text":215},"9c5b4cbe70d65",[214],"d3b148b31a76","ARM NEON",{"_key":217,"_type":69,"marks":218,"text":219},"9c5b4cbe70d66",[],". This article focuses on PCs and servers running on modern AMD64 processors.",[221,223,224],{"_key":196,"_type":174,"href":222,"reference":12},"https://en.wikipedia.org/wiki/SIMD",{"_key":205,"_type":174,"href":222,"reference":12},{"_key":214,"_type":174,"href":225,"reference":12},"https://en.wikipedia.org/wiki/ARM_architecture#Advanced_SIMD_(Neon)",{"_key":227,"_type":65,"children":228,"markDefs":251,"style":73},"12b40c431d24",[229,233,238,242,247],{"_key":230,"_type":69,"marks":231,"text":232},"12b40c431d240",[],"Even with the focus on the AMD64 platform, the topic is way too broad for a single blog post. 
Modern SIMD instructions were introduced to Pentium processors with the release of Pentium 3 in 1999 (that instruction set is ",{"_key":234,"_type":69,"marks":235,"text":237},"12b40c431d241",[236],"2f0add0c40fe","SSE",{"_key":239,"_type":69,"marks":240,"text":241},"12b40c431d242",[],", nowadays it’s sometimes called SSE 1), more of them have been added since then. For a more in-depth introduction, you can read ",{"_key":243,"_type":69,"marks":244,"text":246},"12b40c431d243",[245],"e77d3393ea09","my other article on the subject.",{"_key":248,"_type":69,"marks":249,"text":250},"12b40c431d244",[]," Unlike this blog post, that one doesn’t have practical problems nor benchmarks, instead it tries to provide an overview of what’s available.",[252,254],{"_key":236,"_type":174,"href":253,"reference":12},"https://en.wikipedia.org/wiki/Streaming_SIMD_Extensions",{"_key":245,"_type":174,"href":255,"reference":12},"http://const.me/articles/simd/simd.pdf",{"_key":257,"_type":65,"children":258,"markDefs":263,"style":264},"8a2dd6de5ee2",[259],{"_key":260,"_type":69,"marks":261,"text":262},"8a2dd6de5ee20",[],"What are vector intrinsics?",[],"h2",{"_key":266,"_type":65,"children":267,"markDefs":288,"style":73},"1ca1214e9f2a",[268,272,276,280,284],{"_key":269,"_type":69,"marks":270,"text":271},"1ca1214e9f2a0",[],"To a programmer, intrinsics look just like regular library functions; you include the relevant header, and you can use the intrinsic. To add four float numbers to another four numbers, use the ",{"_key":273,"_type":69,"marks":274,"text":275},"1ca1214e9f2a1",[114,109],"_mm_add_ps",{"_key":277,"_type":69,"marks":278,"text":279},"1ca1214e9f2a2",[]," intrinsic in your code. In the compiler-provided header declaring that intrinsic, ",{"_key":281,"_type":69,"marks":282,"text":283},"1ca1214e9f2a3",[109],"\u003Cxmmintrin.h>",{"_key":285,"_type":69,"marks":286,"text":287},"1ca1214e9f2a4",[],", you’ll find this declaration (Assuming you’re using VC++ compiler. 
In GCC you’ll see something different, which provides the same API to a user.):",[],{"_key":290,"_type":109,"code":291,"markDefs":12},"21b4b8bc621c","extern __m128 _mm_add_ps( __m128 _A, __m128 _B );",{"_key":293,"_type":65,"children":294,"markDefs":315,"style":73},"8262d6166150",[295,299,302,306,311],{"_key":296,"_type":69,"marks":297,"text":298},"8262d61661500",[],"But unlike library functions, intrinsics are implemented directly in compilers. The above ",{"_key":300,"_type":69,"marks":301,"text":275},"8262d61661501",[109,114],{"_key":303,"_type":69,"marks":304,"text":305},"8262d61661502",[]," SSE intrinsic typically¹ compiles into a single instruction, ",{"_key":307,"_type":69,"marks":308,"text":310},"8262d61661503",[309],"f8db70b85116","addps",{"_key":312,"_type":69,"marks":313,"text":314},"8262d61661504",[],". In the time it takes a CPU to call a library function, it might have completed a dozen of these instructions.",[316],{"_key":309,"_type":174,"href":317,"reference":12},"https://www.felixcloutier.com/x86/addps",{"_key":319,"_type":65,"children":320,"markDefs":340,"style":73},"8314da446814",[321,325,329,332,336],{"_key":322,"_type":69,"marks":323,"text":324},"8314da4468140",[],"¹ (That instruction can fetch one of the arguments from memory, but not both. 
If you call it in a way so the compiler has to load both arguments from memory, like this ",{"_key":326,"_type":69,"marks":327,"text":328},"8314da4468141",[109],"__m128 sum = ",{"_key":330,"_type":69,"marks":331,"text":275},"8314da4468142",[109,114],{"_key":333,"_type":69,"marks":334,"text":335},"8314da4468143",[109],"( *p1, *p2 );",{"_key":337,"_type":69,"marks":338,"text":339},"8314da4468144",[]," the compiler will emit two instructions: the first one to load an argument from memory into a register, the second one to add the four values.)",[],{"_key":342,"_type":65,"children":343,"markDefs":356,"style":73},"4eb54d1691f5",[344,348,352],{"_key":345,"_type":69,"marks":346,"text":347},"4eb54d1691f50",[],"The ",{"_key":349,"_type":69,"marks":350,"text":351},"4eb54d1691f51",[109],"__m128",{"_key":353,"_type":69,"marks":354,"text":355},"4eb54d1691f52",[]," built-in data type is a vector of four floating point numbers; 32 bits each, 128 bits in total. CPUs have wide registers for that data type, 128 bits per register. Since AVX was introduced in 2011, in current PC processors these registers are 256 bits wide, each one of them can fit eight float values, four double-precision float values, or a large number of integers, depending on their size.",[],{"_key":358,"_type":65,"children":359,"markDefs":388,"style":73},"927c845bb0c4",[360,364,368,372,376,380,384],{"_key":361,"_type":69,"marks":362,"text":363},"927c845bb0c40",[],"Source code that contains sufficient amounts of vector intrinsics or embeds their assembly equivalents is called manually vectorized code. Modern compilers and libraries already implement a lot of stuff with them using intrinsics, assembly, or a combination of the two. 
For example, some implementations of the ",{"_key":365,"_type":69,"marks":366,"text":367},"927c845bb0c41",[109],"memset",{"_key":369,"_type":69,"marks":370,"text":371},"927c845bb0c42",[],", ",{"_key":373,"_type":69,"marks":374,"text":375},"927c845bb0c43",[109],"memcpy",{"_key":377,"_type":69,"marks":378,"text":379},"927c845bb0c44",[],", or ",{"_key":381,"_type":69,"marks":382,"text":383},"927c845bb0c45",[109],"memmove",{"_key":385,"_type":69,"marks":386,"text":387},"927c845bb0c46",[]," standard C library routines use SSE2 instructions for better throughput. Yet outside of niche areas like high-performance computing, game development, or compiler development, even very experienced C and C++ programmers are largely unfamiliar with SIMD intrinsics.",[],{"_key":390,"_type":65,"children":391,"markDefs":396,"style":73},"91ef338d668b",[392],{"_key":393,"_type":69,"marks":394,"text":395},"91ef338d668b0",[],"To help demonstrate, I’m going to present three practical problems and discuss how SIMD helped.",[],{"_key":398,"_type":65,"children":399,"markDefs":404,"style":82},"e07f83975119",[400],{"_key":401,"_type":69,"marks":402,"text":403},"e07f839751190",[],"Image processing: grayscale",[],{"_key":406,"_type":65,"children":407,"markDefs":421,"style":73},"912da6733d66",[408,412,417],{"_key":409,"_type":69,"marks":410,"text":411},"912da6733d660",[],"Suppose that we need to write a function that converts RGB image to grayscale. ",{"_key":413,"_type":69,"marks":414,"text":416},"912da6733d661",[415],"ef5285d35ee1","Someone asked this very question recently",{"_key":418,"_type":69,"marks":419,"text":420},"912da6733d662",[],".",[422],{"_key":415,"_type":174,"href":423,"reference":12},"https://stackoverflow.com/q/58881359/126995",{"_key":425,"_type":65,"children":426,"markDefs":449,"style":73},"e2c11bd9b3b8",[427,431,436,440,445],{"_key":428,"_type":69,"marks":429,"text":430},"e2c11bd9b3b80",[],"Many practical applications need code like this. 
For example, when you compress raw image data to JPEG or video data to H.264 or H.265, the first step of the compression is quite similar. Specifically, compressors convert RGB pixels into ",{"_key":432,"_type":69,"marks":433,"text":435},"e2c11bd9b3b81",[434],"5aef712608e4","YUV",{"_key":437,"_type":69,"marks":438,"text":439},"e2c11bd9b3b82",[]," color space. The exact color space is defined in the specs of these formats—for video, it’s often ",{"_key":441,"_type":69,"marks":442,"text":444},"e2c11bd9b3b83",[443],"a780cbcb82dd","ITU-R BT.709",{"_key":446,"_type":69,"marks":447,"text":448},"e2c11bd9b3b84",[]," these days. See section 3, “Signal format” of that spec.",[450,452],{"_key":434,"_type":174,"href":451,"reference":12},"https://en.wikipedia.org/wiki/YUV",{"_key":443,"_type":174,"href":453,"reference":12},"https://www.itu.int/rec/R-REC-BT.709/en",{"_key":455,"_type":65,"children":456,"markDefs":461,"style":264},"c8f1c7c40feb",[457],{"_key":458,"_type":69,"marks":459,"text":460},"c8f1c7c40feb0",[],"Performance comparison",[],{"_key":463,"_type":65,"children":464,"markDefs":496,"style":73},"1153bb53f27d",[465,469,474,478,483,487,492],{"_key":466,"_type":69,"marks":467,"text":468},"1153bb53f27d0",[],"I’ve implemented a few versions, vectorized and not, and tested them with random images. My desktop has an ",{"_key":470,"_type":69,"marks":471,"text":473},"1153bb53f27d1",[472],"fa6003f1e900","AMD Ryzen 5 3600",{"_key":475,"_type":69,"marks":476,"text":477},"1153bb53f27d2",[]," plugged in, my laptop has an ",{"_key":479,"_type":69,"marks":480,"text":482},"1153bb53f27d3",[481],"3e85a3c0d9d9","Intel i3-6157U",{"_key":484,"_type":69,"marks":485,"text":486},"1153bb53f27d4",[]," soldered. The ",{"_key":488,"_type":69,"marks":489,"text":491},"1153bb53f27d5",[490],"3b5fbc2d4db3","WSL",{"_key":493,"_type":69,"marks":494,"text":495},"1153bb53f27d6",[]," column has results from the same desktop, but for a Linux binary built with GCC 7.4. 
The three rightmost columns of the table contain time in milliseconds (best of five runs), for an image of 3840x2160 pixels.",[497,499,501],{"_key":472,"_type":174,"href":498,"reference":12},"https://www.amd.com/en/products/cpu/amd-ryzen-5-3600",{"_key":481,"_type":174,"href":500,"reference":12},"https://ark.intel.com/content/www/us/en/ark/products/96484/intel-core-i3-6157u-processor-3m-cache-2-40-ghz.html",{"_key":490,"_type":174,"href":502,"reference":12},"https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux",{"_key":504,"_type":53,"alt":12,"asset":505,"caption":507,"markDefs":12},"817bf7aedfeb",{"_ref":506,"_type":56},"image-115a190c2a3b97e1927ee743ca947af050b786ed-635x276-png","",{"_key":509,"_type":65,"children":510,"markDefs":515,"style":264},"830e86658a71",[511],{"_key":512,"_type":69,"marks":513,"text":514},"830e86658a710",[],"Observations",[],{"_key":517,"_type":65,"children":518,"markDefs":523,"style":73},"2a64481ea3a4",[519],{"_key":520,"_type":69,"marks":521,"text":522},"2a64481ea3a40",[],"Vectorized versions are three to eight times faster than scalar code. On the laptop, the scalar version is likely too slow to handle 60 FPS video of frames of this size, while the performance of vectorized code is OK for that.",[],{"_key":525,"_type":65,"children":526,"markDefs":547,"style":73},"48d8ad551471",[527,531,535,539,543],{"_key":528,"_type":69,"marks":529,"text":530},"48d8ad5514710",[],"The best way to vectorize that particular algorithm appears to be fixed-point 16-bit math. Vector registers fit twice as many 16-bit integers as 32-bit floats, allowing to process twice as many pixels in parallel spending approximately the same time. On my desktop, ",{"_key":532,"_type":69,"marks":533,"text":534},"48d8ad5514711",[109],"_mm_mul_ps",{"_key":536,"_type":69,"marks":537,"text":538},"48d8ad5514712",[]," SSE 1 intrinsic (multiplies four floats from 128-bit registers) has 3 cycles latency, and 0.5 cycles throughput. 
",{"_key":540,"_type":69,"marks":541,"text":542},"48d8ad5514713",[109],"_mm_mulhi_epu16",{"_key":544,"_type":69,"marks":545,"text":546},"48d8ad5514714",[]," SSE 2 intrinsic (multiplies eight fixed-point numbers from 128-bit registers) has the same 3 cycles latency and 1 cycle throughput.",[],{"_key":549,"_type":65,"children":550,"markDefs":555,"style":73},"930edd46be43",[551],{"_key":552,"_type":69,"marks":553,"text":554},"930edd46be430",[],"In my experience, this outcome is common for image and video processing on CPU, not just for this particular grayscale problem.",[],{"_key":557,"_type":65,"children":558,"markDefs":563,"style":73},"314ff93d789f",[559],{"_key":560,"_type":69,"marks":561,"text":562},"314ff93d789f0",[],"On the desktop, upgrading from SSE to AVX—with twice as wide SIMD vectors—only improved performance a tiny bit. On the laptop it helped substantially. A likely reason for that is the RAM bandwidth bottleneck on the desktop. This is common, too, over the course of many years, CPU performance has been growing somewhat faster than memory bandwidth.",[],{"_key":565,"_type":65,"children":566,"markDefs":571,"style":82},"05bf5377c021",[567],{"_key":568,"_type":69,"marks":569,"text":570},"05bf5377c0210",[],"General math: dot product",[],{"_key":573,"_type":65,"children":574,"markDefs":596,"style":73},"888aa644692d",[575,579,584,588,593],{"_key":576,"_type":69,"marks":577,"text":578},"888aa644692d0",[],"Write a function to compute a dot product of two float vectors. ",{"_key":580,"_type":69,"marks":581,"text":583},"888aa644692d1",[582],"6bf3d3b14147","Here’s a relevant Stack Overflow question",{"_key":585,"_type":69,"marks":586,"text":587},"888aa644692d2",[],". 
A popular application for dot products these days is ",{"_key":589,"_type":69,"marks":590,"text":592},"888aa644692d3",[591],"c691876df432","machine learning",{"_key":594,"_type":69,"marks":595,"text":420},"888aa644692d4",[],[597,599],{"_key":582,"_type":174,"href":598,"reference":12},"https://stackoverflow.com/q/59494745/126995",{"_key":591,"_type":174,"href":600,"reference":12},"https://stats.stackexchange.com/a/291760/",{"_key":602,"_type":65,"children":603,"markDefs":607,"style":264},"4e19129f395f",[604],{"_key":605,"_type":69,"marks":606,"text":460},"4e19129f395f0",[],[],{"_key":609,"_type":65,"children":610,"markDefs":615,"style":73},"a488d481c86e",[611],{"_key":612,"_type":69,"marks":613,"text":614},"a488d481c86e0",[],"I didn’t want to bottleneck on memory again, so I’ve made a test that computes a dot product of 256k-long vectors, taking 1MB RAM each. That amount of data fits in processor caches on both computers I’m using for benchmarks: the desktop has a 3MB L2 cache and a 32MB L3 cache, the laptop has a 3MB L3 cache and a 64MB L4 cache. 
The three rightmost columns are microseconds (µs), best of ten runs.",[],{"_key":617,"_type":53,"alt":12,"asset":618,"caption":507,"markDefs":12},"a8ec5e3a750a",{"_ref":619,"_type":56},"image-4da6d42692c0a3d67c5ac792083394ffa2c94352-633x349-png",{"_key":621,"_type":65,"children":622,"markDefs":626,"style":264},"dfccfe0ae62d",[623],{"_key":624,"_type":69,"marks":625,"text":514},"dfccfe0ae62d0",[],[],{"_key":628,"_type":65,"children":629,"markDefs":634,"style":73},"9d3c873160e8",[630],{"_key":631,"_type":69,"marks":632,"text":633},"9d3c873160e80",[],"Best versions are 5-12 times faster than scalar code.",[],{"_key":636,"_type":65,"children":637,"markDefs":650,"style":73},"3778ff3de395",[638,642,646],{"_key":639,"_type":69,"marks":640,"text":641},"3778ff3de3950",[],"The best SSE1-only version, ",{"_key":643,"_type":69,"marks":644,"text":645},"3778ff3de3951",[109],"SseVertical4",{"_key":647,"_type":69,"marks":648,"text":649},"3778ff3de3952",[],", delivered close performance to AVX+FMA. A likely reason for that is memory bandwidth. The source data is in the cache, so the bandwidth itself is very high. However, CPUs can only do a couple loads per cycle. The code reads from two input arrays at once and is likely to hit that limit.",[],{"_key":652,"_type":65,"children":653,"markDefs":658,"style":73},"62c85d415213",[654],{"_key":655,"_type":69,"marks":656,"text":657},"62c85d4152130",[],"When built with VC++, single accumulator non-FMA SSE and especially AVX versions performed surprisingly well. I’ve looked at the disassembly. The compiler managed to hide some latency with instructions reordering. The code computes the product, increments the pointers, adds product to the accumulator, and finally tests for loop exit condition. This way, vector and scalar instructions are interleaved, hiding the latency of both. 
To an extent: the four-accumulators version is still faster.",[],{"_key":660,"_type":65,"children":661,"markDefs":674,"style":73},"2ca586a80d16",[662,666,670],{"_key":663,"_type":69,"marks":664,"text":665},"2ca586a80d160",[],"The GCC-built scalar version is quite slow. This might be caused by my compiler options in ",{"_key":667,"_type":69,"marks":668,"text":669},"2ca586a80d161",[109],"CMakeLists.txt",{"_key":671,"_type":69,"marks":672,"text":673},"2ca586a80d162",[],". I’m not sure they’re good enough, because for the last few years, I only built Linux software running on ARM devices.",[],{"_key":676,"_type":65,"children":677,"markDefs":682,"style":264},"daf91646896b",[678],{"_key":679,"_type":69,"marks":680,"text":681},"daf91646896b0",[],"Why multiple accumulators?",[],{"_key":684,"_type":65,"children":685,"markDefs":690,"style":73},"21925f72bd4d",[686],{"_key":687,"_type":69,"marks":688,"text":689},"21925f72bd4d0",[],"Data dependencies are the main thing I’d like to illustrate with this example.",[],{"_key":692,"_type":65,"children":693,"markDefs":707,"style":73},"067c127dfcae",[694,698,703],{"_key":695,"_type":69,"marks":696,"text":697},"067c127dfcae0",[],"From a computer scientist’s point of view, a dot product is a form of ",{"_key":699,"_type":69,"marks":700,"text":702},"067c127dfcae1",[701],"6270b297dbec","reduction",{"_key":704,"_type":69,"marks":705,"text":706},"067c127dfcae2",[],". The algorithm needs to process large input vectors and compute just a single value. 
When the computations are fast (like in this case, multiplying floats from sequential blocks of memory is very fast), the throughput is often limited by latency of the reduce operation.",[708],{"_key":701,"_type":174,"href":709,"reference":12},"https://en.wikipedia.org/wiki/Reduce_(parallel_pattern)",{"_key":711,"_type":65,"children":712,"markDefs":733,"style":73},"e04c4ee4f7bb",[713,717,721,725,729],{"_key":714,"_type":69,"marks":715,"text":716},"e04c4ee4f7bb0",[],"Let’s compare code of two specific versions, ",{"_key":718,"_type":69,"marks":719,"text":720},"e04c4ee4f7bb1",[109],"AvxVerticalFma",{"_key":722,"_type":69,"marks":723,"text":724},"e04c4ee4f7bb2",[]," and ",{"_key":726,"_type":69,"marks":727,"text":728},"e04c4ee4f7bb3",[109],"AvxVerticalFma2",{"_key":730,"_type":69,"marks":731,"text":732},"e04c4ee4f7bb4",[],". The former has the following main loop:",[],{"_key":735,"_type":109,"code":736,"markDefs":12},"15ce10e5990f","for( ; p1 \u003C p1End; p1 += 8, p2 += 8 )\n{\n          const __m256 a = _mm256_loadu_ps( p1 );\n          const __m256 b = _mm256_loadu_ps( p2 );\n          acc = _mm256_fmadd_ps( a, b, acc ); // Update the only accumulator\n}\n",{"_key":738,"_type":65,"children":739,"markDefs":747,"style":73},"4f10c728b64e",[740,743],{"_key":741,"_type":69,"marks":742,"text":728},"4f10c728b64e0",[109],{"_key":744,"_type":69,"marks":745,"text":746},"4f10c728b64e1",[]," version runs following code:",[],{"_key":749,"_type":109,"code":750,"markDefs":12},"672f67416ea6","for( ; p1 \u003C p1End; p1 += 16, p2 += 16 )\n{\n          __m256 a = _mm256_loadu_ps( p1 );\n          __m256 b = _mm256_loadu_ps( p2 );\n          dot0 = _mm256_fmadd_ps( a, b, dot0 ); // Update the first accumulator\n          a = _mm256_loadu_ps( p1 + 8 );\n          b = _mm256_loadu_ps( p2 + 8 );\n          dot1 = _mm256_fmadd_ps( a, b, dot1 ); // Update the second 
accumulator\n}",{"_key":752,"_type":65,"children":753,"markDefs":778,"style":73},"eec329b1b8f6",[754,758,762,767,771,774],{"_key":755,"_type":69,"marks":756,"text":757},"eec329b1b8f60",[114,109],"_mm256_fmadd_ps",{"_key":759,"_type":69,"marks":760,"text":761},"eec329b1b8f61",[]," intrinsic computes (a*b)+c for arrays of eight float values, that instruction is part of ",{"_key":763,"_type":69,"marks":764,"text":766},"eec329b1b8f62",[765],"121cea43b568","FMA3",{"_key":768,"_type":69,"marks":769,"text":770},"eec329b1b8f63",[]," instruction set. The reason why ",{"_key":772,"_type":69,"marks":773,"text":728},"eec329b1b8f64",[109],{"_key":775,"_type":69,"marks":776,"text":777},"eec329b1b8f65",[]," version is almost 2x faster—deeper pipelining hiding the latency.",[779],{"_key":765,"_type":174,"href":780,"reference":12},"https://en.wikipedia.org/wiki/FMA_instruction_set#FMA3_instruction_set",{"_key":782,"_type":65,"children":783,"markDefs":797,"style":73},"9b47f9d510db",[784,788,793],{"_key":785,"_type":69,"marks":786,"text":787},"9b47f9d510db0",[],"When the processor submits an instruction, it needs values of the arguments. If some of them are not yet available, the processor waits for them to arrive. The tables on ",{"_key":789,"_type":69,"marks":790,"text":792},"9b47f9d510db1",[791],"8a05d3555e37","https://www.agner.org/",{"_key":794,"_type":69,"marks":795,"text":796},"9b47f9d510db2",[]," say on AMD Ryzen the latency of that FMA instruction is five cycles. This means once the processor started to execute that instruction, the result of the computation will only arrive five CPU cycles later. 
When the loop is running a single FMA instruction which needs the result computed by the previous loop iteration, that loop can only run one iteration in five CPU cycles.",[798],{"_key":791,"_type":174,"href":792,"reference":12},{"_key":800,"_type":65,"children":801,"markDefs":806,"style":73},"8f94de2bba1f",[802],{"_key":803,"_type":69,"marks":804,"text":805},"8f94de2bba1f0",[],"With two accumulators that limit is the same, five cycles. However, the loop body now contains two FMA instructions that don’t depend on each other. These two instructions run in parallel, and the code delivers twice the throughput on the desktop.",[],{"_key":808,"_type":65,"children":809,"markDefs":814,"style":73},"9f41ee50c845",[810],{"_key":811,"_type":69,"marks":812,"text":813},"9f41ee50c8450",[],"That’s not the case on the laptop, though. The laptop was clearly bottlenecked on something else, but I’m not sure what that was.",[],{"_key":816,"_type":65,"children":817,"markDefs":822,"style":264},"7ae6673f1a00",[818],{"_key":819,"_type":69,"marks":820,"text":821},"7ae6673f1a000",[],"Bonus chapter: precision issues",[],{"_key":824,"_type":65,"children":825,"markDefs":830,"style":73},"d5b3b05c8667",[826],{"_key":827,"_type":69,"marks":828,"text":829},"d5b3b05c86670",[],"Initially, this benchmark used much larger vectors at 256 MB each. I quickly discovered the performance in that case was limited by memory bandwidth, with not much differentiation showing up in the results.",[],{"_key":832,"_type":65,"children":833,"markDefs":838,"style":73},"595963259426",[834],{"_key":835,"_type":69,"marks":836,"text":837},"5959632594260",[],"There was another interesting issue, however.",[],{"_key":840,"_type":65,"children":841,"markDefs":846,"style":73},"e8f7bedfc10a",[842],{"_key":843,"_type":69,"marks":844,"text":845},"e8f7bedfc10a0",[],"Besides just measuring the time, my test program prints the computed dot product. 
This is to make sure compilers don’t optimize away the code and to check that the result is the same across my two computers and 15 implementations.",[],{"_key":848,"_type":65,"children":849,"markDefs":854,"style":73},"b1d62233d64f",[850],{"_key":851,"_type":69,"marks":852,"text":853},"b1d62233d64f0",[],"I was surprised to see the scalar version printed 1.31E+7 while all other versions printed 1.67E+7. Initially, I thought it was a bug somewhere. I implemented a scalar version that uses double-precision accumulator, and sure enough, it printed 1.67E+7.",[],{"_key":856,"_type":65,"children":857,"markDefs":862,"style":73},"8c95f096e818",[858],{"_key":859,"_type":69,"marks":860,"text":861},"8c95f096e8180",[],"That whopping 20% error was caused by accumulation order. When a code adds a small float value to a large float value, a lot of precision is lost. An extreme example: when the first float value is larger than 8.4 million and the second value is smaller than 1.0, it won’t add anything at all. It will just return the larger of the two arguments!",[],{"_key":864,"_type":65,"children":865,"markDefs":879,"style":73},"e54b38c2a11a",[866,870,875],{"_key":867,"_type":69,"marks":868,"text":869},"e54b38c2a11a0",[],"Technically, you can often achieve a more precise result with a ",{"_key":871,"_type":69,"marks":872,"text":874},"e54b38c2a11a1",[873],"fa7f5dc3d82a","pairwise summation",{"_key":876,"_type":69,"marks":877,"text":878},"e54b38c2a11a2",[]," approach. My vectorized code doesn’t quite do that. Still, the four-accumulators AVX version accumulates 32 independent scalar values (four registers with eight floats each), which is a step in the same direction. 
When there are 64 million numbers to sum up, 32 independent accumulators helped a lot with the precision.",[880],{"_key":873,"_type":174,"href":881,"reference":12},"https://en.wikipedia.org/wiki/Pairwise_summation",{"_key":883,"_type":65,"children":884,"markDefs":889,"style":82},"7182e2171c97",[885],{"_key":886,"_type":69,"marks":887,"text":888},"7182e2171c970",[],"Image processing: flood fill",[],{"_key":891,"_type":65,"children":892,"markDefs":897,"style":73},"a05705c4b9ff",[893],{"_key":894,"_type":69,"marks":895,"text":896},"a05705c4b9ff0",[],"For the final part of the article, I’ve picked a slightly more complicated problem.",[],{"_key":899,"_type":65,"children":900,"markDefs":914,"style":73},"fff9b7518902",[901,905,910],{"_key":902,"_type":69,"marks":903,"text":904},"fff9b75189020",[],"For a layman, flood fill is what happens when you open an image in an editor, select the “paint bucket” tool, and click on the image. Mathematically, it’s a ",{"_key":906,"_type":69,"marks":907,"text":909},"fff9b75189021",[908],"074d3908a18e","connected-component labeling",{"_key":911,"_type":69,"marks":912,"text":913},"fff9b75189022",[]," operating on a regular 2D grid graph.",[915],{"_key":908,"_type":174,"href":916,"reference":12},"https://en.wikipedia.org/wiki/Connected-component_labeling",{"_key":918,"_type":65,"children":919,"markDefs":942,"style":73},"13360728f714",[920,924,929,933,938],{"_key":921,"_type":69,"marks":922,"text":923},"13360728f7140",[],"Unlike the first two problems, it’s not immediately clear how to vectorize this one. It’s not an ",{"_key":925,"_type":69,"marks":926,"text":928},"13360728f7141",[927],"1c98feb6172c","embarrassingly parallel",{"_key":930,"_type":69,"marks":931,"text":932},"13360728f7142",[]," problem. In fact, flood fill is quite hard to efficiently implement on ",{"_key":934,"_type":69,"marks":935,"text":937},"13360728f7143",[936],"6bf93c2a44d7","GPGPU",{"_key":939,"_type":69,"marks":940,"text":941},"13360728f7144",[],". 
Still, with some efforts, it’s possible to use SIMD in a way that significantly outperforms scalar code.",[943,945],{"_key":927,"_type":174,"href":944,"reference":12},"https://en.wikipedia.org/wiki/Embarrassingly_parallel",{"_key":936,"_type":174,"href":946,"reference":12},"https://en.wikipedia.org/wiki/General-purpose_computing_on_graphics_processing_units",{"_key":948,"_type":65,"children":949,"markDefs":963,"style":73},"c74f7e672d19",[950,954,959],{"_key":951,"_type":69,"marks":952,"text":953},"c74f7e672d190",[],"Because of the complexity, I only created two implementations. The first, the scalar version, is scanline fill, ",{"_key":955,"_type":69,"marks":956,"text":958},"c74f7e672d191",[957],"0f243efcca37","described in Wikipedia",{"_key":960,"_type":69,"marks":961,"text":962},"c74f7e672d192",[],". Not too optimized, but not particularly slow either.",[964],{"_key":957,"_type":174,"href":965,"reference":12},"https://en.wikipedia.org/wiki/Flood_fill#Scanline_fill",{"_key":967,"_type":65,"children":968,"markDefs":982,"style":73},"e99e8a816a18",[969,973,978],{"_key":970,"_type":69,"marks":971,"text":972},"e99e8a816a180",[],"The second, the vectorized version, is a custom implementation. It requires AVX2. 
It splits the image into a 2D array of small dense blocks (in my implementation the blocks are 16x16 pixels, one bit per pixel), then I run something resembling Wikipedia’s ",{"_key":974,"_type":69,"marks":975,"text":977},"e99e8a816a181",[976],"f30335992f7c","forest fire algorithm",{"_key":979,"_type":69,"marks":980,"text":981},"e99e8a816a182",[],", only instead of individual pixels I process complete blocks.",[983],{"_key":976,"_type":174,"href":984,"reference":12},"https://en.wikipedia.org/wiki/Flood_fill#Alternative_implementations",{"_key":986,"_type":65,"children":987,"markDefs":1010,"style":73},"a85cd44d88e1",[988,992,997,1001,1006],{"_key":989,"_type":69,"marks":990,"text":991},"a85cd44d88e10",[],"In the results table below, the numbers shown are in milliseconds. I ran each implementation on two images: ",{"_key":993,"_type":69,"marks":994,"text":996},"a85cd44d88e11",[995],"78fc8e595307","maze-diagonal.png",{"_key":998,"_type":69,"marks":999,"text":1000},"a85cd44d88e12",[],", 2212×2212 pixels, filled from the point x=885 y=128; and ",{"_key":1002,"_type":69,"marks":1003,"text":1005},"a85cd44d88e13",[1004],"f86d030a01c4","shapes.png",{"_key":1007,"_type":69,"marks":1008,"text":1009},"a85cd44d88e14",[],", 1024x1024 pixels, filled from the same point. Due to the nature of the problem, the time it takes to fill an image depends a lot on the image and other input parameters. 
For the first test, I’ve deliberately picked an image which is relatively hard to flood fill.",[1011,1013],{"_key":995,"_type":174,"href":1012,"reference":12},"https://github.com/Const-me/SimdIntroArticle/blob/master/FloodFill/Images/maze-diagonal.png",{"_key":1004,"_type":174,"href":1014,"reference":12},"https://github.com/Const-me/SimdIntroArticle/blob/master/FloodFill/Images/shapes.png",{"_key":1016,"_type":53,"alt":12,"asset":1017,"caption":507,"markDefs":12},"cd03b69327a1",{"_ref":1018,"_type":56},"image-82e395e16e2986843be158e20feed569a289173f-617x109-png",{"_key":1020,"_type":65,"children":1021,"markDefs":1035,"style":73},"8242d7c96102",[1022,1026,1031],{"_key":1023,"_type":69,"marks":1024,"text":1025},"8242d7c961020",[],"As you see from the table, vectorization improved performance by a factor of 1.9-3.5, depending on CPU, compiler, and the image. Both test images are in the repository, in the ",{"_key":1027,"_type":69,"marks":1028,"text":1030},"8242d7c961021",[1029],"a8b5206259f9","FloodFill/Images",{"_key":1032,"_type":69,"marks":1033,"text":1034},"8242d7c961022",[]," subfolder.",[1036],{"_key":1029,"_type":174,"href":1037,"reference":12},"https://github.com/Const-me/SimdIntroArticle/tree/master/FloodFill/Images",{"_key":1039,"_type":65,"children":1040,"markDefs":1045,"style":82},"79342beed999",[1041],{"_key":1042,"_type":69,"marks":1043,"text":1044},"79342beed9990",[],"Conclusions",[],{"_key":1047,"_type":65,"children":1048,"markDefs":1053,"style":73},"1fe26204a9fd",[1049],{"_key":1050,"_type":69,"marks":1051,"text":1052},"1fe26204a9fd0",[],"The performance win is quite large in practice.",[],{"_key":1055,"_type":65,"children":1056,"markDefs":1061,"style":73},"588c616365ca",[1057],{"_key":1058,"_type":69,"marks":1059,"text":1060},"588c616365ca0",[],"The engineering overhead for vectorized code is not insignificant, especially for the flood fill, where the vectorized version has three to four times more code than the scalar scanline fill version. 
Admittedly, vectorized code is harder to read and debug; the difference fades with experience, but never disappears.",[],{"_key":1063,"_type":65,"children":1064,"markDefs":1069,"style":264},"e5de560a139a",[1065],{"_key":1066,"_type":69,"marks":1067,"text":1068},"e5de560a139a0",[],"Source Code",[],{"_key":1071,"_type":65,"children":1072,"markDefs":1094,"style":73},"ccedba439078",[1073,1077,1082,1086,1091],{"_key":1074,"_type":69,"marks":1075,"text":1076},"ccedba4390780",[],"I have posted the source code for these tests ",{"_key":1078,"_type":69,"marks":1079,"text":1081},"ccedba4390781",[1080],"a82b9a3019bd","on GitHub",{"_key":1083,"_type":69,"marks":1084,"text":1085},"ccedba4390782",[],". It requires C++/17, and I’ve tested on Windows 10 with Visual Studio 2017 and Ubuntu Linux 18 with gcc 7.4.0. The free Community edition of Visual Studio is fine. I have only tested 64-bit builds. The code is published under the copy/paste-friendly terms of ",{"_key":1087,"_type":69,"marks":1088,"text":1090},"ccedba4390783",[1089],"85c5d54a1614","MIT license",{"_key":1092,"_type":69,"marks":1093,"text":420},"ccedba4390784",[],[1095,1097],{"_key":1080,"_type":174,"href":1096,"reference":12},"https://github.com/Const-me/SimdIntroArticle",{"_key":1089,"_type":174,"href":1098,"reference":12},"https://en.wikipedia.org/wiki/MIT_License",{"_key":1100,"_type":65,"children":1101,"markDefs":1106,"style":73},"4657724ed3d8",[1102],{"_key":1103,"_type":69,"marks":1104,"text":1105},"4657724ed3d80",[],"Because this article is targeted towards people unfamiliar with SIMD, I wrote more comments than I normally do, and I hope they help.",[],{"_key":1108,"_type":65,"children":1109,"markDefs":1114,"style":73},"c559d6b10cd7",[1110],{"_key":1111,"_type":69,"marks":1112,"text":1113},"c559d6b10cd70",[],"Here are the commands I used to build the test projects on Linux:",[],{"_key":1116,"_type":109,"code":1117,"markDefs":12},"cfcb304a304f","mkdir build\ncd build\ncmake 
../\nmake",{"_key":1119,"_type":65,"children":1120,"markDefs":1167,"style":73},"3df2f8e050b0",[1121,1125,1129,1132,1136,1140,1145,1149,1154,1158,1163],{"_key":1122,"_type":69,"marks":1123,"text":1124},"3df2f8e050b00",[],"This blog post is about intrinsics, not C++/17. The C++ parts are less than ideal; I’ve implemented only the bare minimum required for the benchmarks. The flood fill project includes ",{"_key":1126,"_type":69,"marks":1127,"text":1128},"3df2f8e050b01",[109],"stb_image",{"_key":1130,"_type":69,"marks":1131,"text":724},"3df2f8e050b02",[],{"_key":1133,"_type":69,"marks":1134,"text":1135},"3df2f8e050b03",[109],"stb_image_write",{"_key":1137,"_type":69,"marks":1138,"text":1139},"3df2f8e050b04",[]," third-party libraries to handle PNG images: ",{"_key":1141,"_type":69,"marks":1142,"text":1144},"3df2f8e050b05",[1143],"d712a65c9983","http://nothings.org/stb",{"_key":1146,"_type":69,"marks":1147,"text":1148},"3df2f8e050b06",[],". Again, this is probably not something I would do in production-quality C++ code. 
OS-provided image codecs are generally better, ",{"_key":1150,"_type":69,"marks":1151,"text":1153},"3df2f8e050b07",[1152],"856b0b73bcaa","libpng",{"_key":1155,"_type":69,"marks":1156,"text":1157},"3df2f8e050b08",[]," on Linux, or ",{"_key":1159,"_type":69,"marks":1160,"text":1162},"3df2f8e050b09",[1161],"d3b88230f30a","WIC",{"_key":1164,"_type":69,"marks":1165,"text":1166},"3df2f8e050b010",[]," on Windows.",[1168,1169,1171],{"_key":1143,"_type":174,"href":1144,"reference":12},{"_key":1152,"_type":174,"href":1170,"reference":12},"http://www.libpng.org/",{"_key":1161,"_type":174,"href":1172,"reference":12},"https://en.wikipedia.org/wiki/Windows_Imaging_Component",{"_key":1174,"_type":65,"children":1175,"markDefs":1180,"style":73},"0e3e72e09668",[1176],{"_key":1177,"_type":69,"marks":1178,"text":1179},"0e3e72e096680",[],"I hope this gives you a sense of what’s possible when you tap into the power of SIMD intrinsics.",[],true,"2020/07/08","Many developers write software that’s performance sensitive. After all, that’s one of the major reasons why we still pick C or C++ language these days. When done right, supplementing C or C++ code with vector intrinsics is exceptionally good for performance.",{"_type":53,"asset":1185},{"_ref":1186,"_type":56},"image-4a5c246343c6c74afe5f5f9d98c77c23e3dc8b8b-2560x1440-jpg",{"code":1188,"language":1189},"\u003C!-- wp:paragraph -->\n\u003Cp>When done right, supplementing C or C++ code with vector intrinsics is exceptionally good for performance. For the cases presented in this blog post, vectorization improved performance by a factor of 3 to 12.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:heading {\"level\":1} -->\n\u003Ch1>Introduction\u003C/h1>\n\u003C!-- /wp:heading -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Many developers write software that’s performance sensitive. 
After all, that’s one of the major reasons why we still pick C or C++ language these days.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>All modern processors are actually vector under the hood. Unlike scalar processors, which process data individually, modern vector processors process one-dimensional arrays of data. If you want to maximize performance, you need to write code tailored to these vectors.&nbsp;\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Every time you write \u003Ccode>float s = \u003Cem>a\u003C/em> + \u003Cem>b\u003C/em>;\u003C/code> you’re leaving a lot of performance on the table. The processor could have added four float numbers to another four numbers, or even eight numbers to another eight numbers if that processor supports \u003Ca href=\"https://en.wikipedia.org/wiki/Advanced_Vector_Extensions\">AVX\u003C/a>. Similarly, when you write \u003Ccode>int i = \u003Cem>j\u003C/em> + \u003Cem>k\u003C/em>;\u003C/code> to add 2 integer numbers, you could have added four or eight numbers instead, with corresponding \u003Ca href=\"https://en.wikipedia.org/wiki/SSE2\">SSE2\u003C/a> or AVX2 instructions.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Language designers, compiler developers, and other smart people have been trying for many years to compile scalar code into vector instructions in a way that would leverage the performance potential. So far, none of them have completely succeeded, and I’m not convinced it’s possible.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>One approach to leverage vector hardware are SIMD intrinsics, available in all modern C or C++ compilers. SIMD \u003Ca href=\"https://en.wikipedia.org/wiki/SIMD\">stands for\u003C/a> “single Instruction, multiple data”. 
\u003Ca href=\"https://en.wikipedia.org/wiki/SIMD\">SIMD\u003C/a> instructions are available on many platforms, there’s a high chance your smartphone has it too, through the architecture extension \u003Ca href=\"https://en.wikipedia.org/wiki/ARM_architecture#Advanced_SIMD_(Neon)\">ARM NEON\u003C/a>. This article focuses on PCs and servers running on modern AMD64 processors.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Even with the focus on AMD64 platform, the topic is way too broad for a single blog post. Modern SIMD instructions were introduced to Pentium processors with the release of Pentium 3 in 1999 (that instruction set is \u003Ca href=\"https://en.wikipedia.org/wiki/Streaming_SIMD_Extensions\">SSE\u003C/a>, nowadays it’s sometimes called SSE&nbsp;1), more of them have been added since then. For a more in-depth introduction, you can read \u003Ca href=\"http://const.me/articles/simd/simd.pdf\">my other article on the subject.\u003C/a> Unlike this blog post, that one doesn’t have practical problems nor benchmarks, instead it tries to provide an overview of what’s available.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:heading -->\n\u003Ch2>What are vector intrinsics?\u003C/h2>\n\u003C!-- /wp:heading -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>To a programmer, intrinsics look just like regular library functions; you include the relevant header, and you can use the intrinsic. To add four float numbers to another four numbers, use the \u003Cem>\u003Ccode>_mm_add_ps\u003C/code>\u003C/em> intrinsic in your code. In the compiler-provided header declaring that intrinsic, \u003Ccode>&lt;xmmintrin.h&gt;\u003C/code>, you’ll find this declaration (Assuming you’re using VC++ compiler. 
In GCC you’ll see something different, which provides the same API to a user.):\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:preformatted -->\n\u003Cpre class=\"wp-block-preformatted\">extern __m128 \u003Cem>_mm_add_ps\u003C/em>( __m128 _A, __m128 _B );\u003C/pre>\n\u003C!-- /wp:preformatted -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>But unlike library functions, intrinsics are implemented directly in compilers. The above\u003Ccode> \u003Cem>_mm_add_ps\u003C/em>\u003C/code> SSE intrinsic typically\u003Csup>1\u003C/sup> compiles into a single instruction, \u003Ca href=\"https://www.felixcloutier.com/x86/addps\">addps\u003C/a>. For the time it takes CPU to call a library function, it might have completed a dozen of these instructions.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>\u003Csup>1\u003C/sup>(That instruction can fetch one of the arguments from memory, but not both. If you call it in a way so the compiler has to load both arguments from memory, like this \u003Ccode>__m128 sum = \u003Cem>_mm_add_ps\u003C/em>( *p1, *p2 );\u003C/code> the compiler will emit two instructions: the first one to load an argument from memory into a register, the second one to add the four values.)\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>The \u003Ccode>__m128\u003C/code> built-in data type is a vector of four floating point numbers; 32 bits each, 128 bits in total. CPUs have wide registers for that data type, 128 bits per register. Since AVX was introduced in 2011, in current PC processors these registers are 256 bits wide, each one of them can fit eight float values, four double-precision float values, or a large number of integers, depending on their size.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Source code that contains sufficient amounts of vector intrinsics or embeds their assembly equivalents is called manually vectorized code. 
Modern compilers and libraries already implement a lot of stuff with them using intrinsics, assembly, or a combination of the two. For example, some implementations of the \u003Ccode>memset\u003C/code>, \u003Ccode>memcpy\u003C/code>, or \u003Ccode>memmove\u003C/code> standard C library routines use SSE2 instructions for better throughput. Yet outside of niche areas like high-performance computing, game development, or compiler development, even very experienced C and C++ programmers are largely unfamiliar with SIMD intrinsics.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>To help demonstrate, I’m going to present three practical problems and discuss how SIMD helped.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:heading {\"level\":1} -->\n\u003Ch1>Image processing: grayscale\u003C/h1>\n\u003C!-- /wp:heading -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Suppose that we need to write a function that converts RGB image to grayscale. \u003Ca href=\"https://stackoverflow.com/q/58881359/126995\">Someone asked this very question recently\u003C/a>.&nbsp;\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Many practical applications need code like this. For example, when you compress raw image data to JPEG or video data to H.264 or H.265, the first step of the compression is quite similar. Specifically, compressors convert RGB pixels into \u003Ca href=\"https://en.wikipedia.org/wiki/YUV\">YUV\u003C/a> color space. The exact color space is defined in the specs of these formats—for video, it’s often \u003Ca href=\"https://www.itu.int/rec/R-REC-BT.709/en\">ITU-R BT.709\u003C/a> these days See section 3, “Signal format” of that spec.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:heading -->\n\u003Ch2>Performance comparison\u003C/h2>\n\u003C!-- /wp:heading -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>I’ve implemented a few versions, vectorized and not, and tested them with random images. 
My desktop has an \u003Ca href=\"https://www.amd.com/en/products/cpu/amd-ryzen-5-3600\">AMD Ryzen 5 3600\u003C/a> plugged in, my laptop has an \u003Ca href=\"https://ark.intel.com/content/www/us/en/ark/products/96484/intel-core-i3-6157u-processor-3m-cache-2-40-ghz.html\">Intel i3-6157U\u003C/a> soldered. \u003Ca href=\"https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux\">WSL\u003C/a> column has results from the same desktop, but for a Linux binary built with GCC 7.4. The three rightmost columns of the table contain time in milliseconds (best of five runs), for an image of 3840x2160 pixels.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:image {\"id\":16228,\"sizeSlug\":\"large\"} -->\n\u003Cfigure class=\"wp-block-image size-large\">\u003Cimg src=\"https://stackoverflow.blog/wp-content/uploads/2020/07/image.png\" alt=\"\" class=\"wp-image-16228\"/>\u003C/figure>\n\u003C!-- /wp:image -->\n\n\u003C!-- wp:heading -->\n\u003Ch2>Observations\u003C/h2>\n\u003C!-- /wp:heading -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Vectorized versions are three to eight times faster than scalar code. On the laptop, the scalar version is likely too slow to handle 60 FPS video of frames of this size, while the performance of vectorized code is OK for that.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>The best way to vectorize that particular algorithm appears to be fixed-point 16-bit math. Vector registers fit twice as many 16-bit integers as 32-bit floats, allowing to process twice as many pixels in parallel spending approximately the same time. On my desktop, \u003Ccode>_mm_mul_ps\u003C/code> SSE 1 intrinsic (multiplies four floats from 128-bit registers) has 3 cycles latency, and 0.5 cycles throughput. 
\u003Ccode>_mm_mulhi_epu16\u003C/code> SSE 2 intrinsic (multiplies eight fixed-point numbers from 128-bit registers) has the same 3 cycles latency and 1 cycle throughput.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>In my experience, this outcome is common for image and video processing on CPU, not just for this particular grayscale problem.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>On the desktop, upgrading from SSE to AVX—with twice as wide SIMD vectors—only improved performance a tiny bit. On the laptop it helped substantially. A likely reason for that&nbsp;is the RAM bandwidth bottleneck on the desktop. This is common, too, over the course of many years, CPU performance has been growing somewhat faster than memory bandwidth.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:heading {\"level\":1} -->\n\u003Ch1>General math: dot product\u003C/h1>\n\u003C!-- /wp:heading -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Write a function to compute a dot product of two float vectors. \u003Ca href=\"https://stackoverflow.com/q/59494745/126995\">Here’s a relevant Stack Overflow question\u003C/a>. A popular application for dot products these days is \u003Ca href=\"https://stats.stackexchange.com/a/291760/\">machine learning\u003C/a>.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:heading -->\n\u003Ch2>Performance comparison\u003C/h2>\n\u003C!-- /wp:heading -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>I didn’t want to bottleneck on memory again, so I’ve made a test that computes a dot product of 256k-long vectors, taking 1MB RAM each. That amount of data fits in processor caches on both computers I’m using for benchmarks: the desktop has a 3MB L2 cache and a 32MB L3 cache, the laptop has a 3MB L3 cache and a 64MB L4 cache. 
The three rightmost columns are microseconds (µs), best of ten runs.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:image {\"id\":16229,\"sizeSlug\":\"large\"} -->\n\u003Cfigure class=\"wp-block-image size-large\">\u003Cimg src=\"https://stackoverflow.blog/wp-content/uploads/2020/07/image-1.png\" alt=\"\" class=\"wp-image-16229\"/>\u003C/figure>\n\u003C!-- /wp:image -->\n\n\u003C!-- wp:heading -->\n\u003Ch2>Observations\u003C/h2>\n\u003C!-- /wp:heading -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Best versions are 5-12 times faster than scalar code.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>The best SSE1-only version, \u003Ccode>SseVertical4\u003C/code>, delivered close performance to AVX+FMA. A likely reason for that is memory bandwidth. The source data is in the cache, so the bandwidth itself is very high. However, CPUs can only do a couple loads per cycle. The code reads from two input arrays at once and is likely to hit that limit.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>When built with VC++, single accumulator non-FMA SSE and especially AVX versions performed surprisingly well. I’ve looked at the disassembly. The compiler managed to hide some latency with instructions reordering. The code computes the product, increments the pointers, adds product to the accumulator, and finally tests for loop exit condition. This way, vector and scalar instructions are interleaved, hiding the latency of both. To an extent: the four-accumulators version is still faster.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>The GCC-built scalar version is quite slow. This might be caused by my compiler options in \u003Ccode>CMakeLists.txt\u003C/code>. 
I’m not sure they’re good enough, because for the last few years, I only built Linux software running on ARM devices.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:heading -->\n\u003Ch2>Why multiple accumulators?\u003C/h2>\n\u003C!-- /wp:heading -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Data dependencies is the main thing I’d like to illustrate with this example.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>From a computer scientist point of view, dot product is a form of \u003Ca href=\"https://en.wikipedia.org/wiki/Reduce_(parallel_pattern)\">reduction\u003C/a>.\u003Ca href=\"https://en.wikipedia.org/wiki/Reduce_(parallel_pattern)\"> \u003C/a>The algorithm needs to process large input vectors, and compute just a single value. When the computations are fast (like in this case, multiplying floats from sequential blocks of memory is very fast), the throughput is often limited by latency of the reduce operation.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Let’s compare code of two specific versions, \u003Ccode>AvxVerticalFma\u003C/code> and \u003Ccode>AvxVerticalFma2\u003C/code>. 
The former has the following main loop:\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:preformatted -->\n\u003Cpre class=\"wp-block-preformatted\">for( ; p1 &lt; p1End; p1 += 8, p2 += 8 )\n{\n          const __m256 a = \u003Cem>_mm256_loadu_ps\u003C/em>( p1 );\n          const __m256 b = \u003Cem>_mm256_loadu_ps\u003C/em>( p2 );\n          acc = _mm256_fmadd_ps( a, b, acc ); // Update the only accumulator\n}\n\u003C/pre>\n\u003C!-- /wp:preformatted -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>\u003Ccode>AvxVerticalFma2\u003C/code> version runs following code:\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:preformatted -->\n\u003Cpre class=\"wp-block-preformatted\">for( ; p1 &lt; p1End; p1 += 16, p2 += 16 )\n{\n          __m256 a = \u003Cem>_mm256_loadu_ps\u003C/em>( p1 );\n          __m256 b = \u003Cem>_mm256_loadu_ps\u003C/em>( p2 );\n          dot0 = \u003Cem>_mm256_fmadd_ps\u003C/em>( a, b, dot0 ); // Update the first accumulator\n          a = \u003Cem>_mm256_loadu_ps\u003C/em>( p1 + 8 );\n          b = \u003Cem>_mm256_loadu_ps\u003C/em>( p2 + 8 );\n          dot1 = \u003Cem>_mm256_fmadd_ps\u003C/em>( a, b, dot1 ); // Update the second accumulator\n}\u003C/pre>\n\u003C!-- /wp:preformatted -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>\u003Cem>\u003Ccode>_mm256_fmadd_ps\u003C/code>\u003C/em> intrinsic computes (a*b)+c for arrays of eight float values, that instruction is part of \u003Ca href=\"https://en.wikipedia.org/wiki/FMA_instruction_set#FMA3_instruction_set\">FMA3\u003C/a> instruction set. The reason why \u003Ccode>AvxVerticalFma2\u003C/code> version is almost 2x faster—deeper pipelining hiding the latency.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>When the processor submits an instruction, it needs values of the arguments. If some of them are not yet available, the processor waits for them to arrive. 
The tables on \u003Ca href=\"https://www.agner.org/\">https://www.agner.org/\u003C/a> say on AMD Ryzen the latency of that FMA instruction is five cycles. This means once the processor started to execute that instruction, the result of the computation will only arrive five CPU cycles later. When the loop is running a single FMA instruction which needs the result computed by the previous loop iteration, that loop can only run one iteration in five CPU cycles.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>With two accumulators that limit is the same, five cycles. However, the loop body now contains two FMA instructions that don’t depend on each other. These two instructions run in parallel, and the code delivers twice the throughput on the desktop.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Not the case on the laptop, though. The laptop was clearly bottlenecked on something else, but I’m not sure what was that.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:heading -->\n\u003Ch2>Bonus chapter: precision issues\u003C/h2>\n\u003C!-- /wp:heading -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Initially this benchmark used much larger vectors at 256 MB each. I quickly discovered the performance in that case was limited by memory bandwidth, with not much differentiation showing up in the results.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>There was another interesting issue, however.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Besides just measuring the time, my test program prints the computed dot product. This is to make sure compilers don’t optimize away the code and to check that the result is the same across my two computers and 15 implementations.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>I was surprised to see the scalar version printed 1.31E+7 while all other versions printed 1.67E+7. 
Initially, I thought it was a bug somewhere. I implemented a scalar version that uses double-precision accumulator, and sure enough, it printed 1.67E+7.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>That whopping 20% error was caused by accumulation order. When a code adds a small float value to a large float value, a lot of precision is lost. An extreme example: when the first float value is larger than 8.4 million and the second value is smaller than 1.0, it won’t add anything at all. It will just return the larger of the two arguments!\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Technically, you can often achieve a more precise result with a \u003Ca href=\"https://en.wikipedia.org/wiki/Pairwise_summation\">pairwise summation\u003C/a> approach. My vectorized code doesn’t quite do that. Still, the four-accumulators AVX version accumulates 32 independent scalar values (four registers with eight floats each), which is a step in the same direction. When there are 64 million numbers to sum up, 32 independent accumulators helped a lot with the precision.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:heading {\"level\":1} -->\n\u003Ch1>Image processing: flood fill\u003C/h1>\n\u003C!-- /wp:heading -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>For the final part of the article, I’ve picked a slightly more complicated problem.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>For a layman, flood fill is what happens when you open an image in an editor, select the “paint bucket” tool, and click on the image. Mathematically, it’s a \u003Ca href=\"https://en.wikipedia.org/wiki/Connected-component_labeling\">connected-component labeling\u003C/a> operating on a regular 2D grid graph.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Unlike the first two problems, it’s not immediately clear how to vectorize this one. 
It’s not an \u003Ca href=\"https://en.wikipedia.org/wiki/Embarrassingly_parallel\">embarrassingly parallel\u003C/a> problem. In fact, flood fill is quite hard to efficiently implement on \u003Ca href=\"https://en.wikipedia.org/wiki/General-purpose_computing_on_graphics_processing_units\">GPGPU\u003C/a>. Still, with some efforts, it’s possible to use SIMD in a way that significantly outperforms scalar code.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Because of the complexity, I only created two implementations. The first, the scalar version, is scanline fill, \u003Ca href=\"https://en.wikipedia.org/wiki/Flood_fill#Scanline_fill\">described in Wikipedia\u003C/a>. Not too optimized, but not particularly slow either.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>The second, the vectorized version, is a custom implementation. It requires AVX2. It splits the image into a 2D array of small dense blocks (in my implementation the blocks are 16x16 pixels, one bit per pixel), then I run something resembling Wikipedia’s \u003Ca href=\"https://en.wikipedia.org/wiki/Flood_fill#Alternative_implementations\">forest fire algorithm\u003C/a>, only instead of individual pixels I process complete blocks.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>On the results table below, the numbers shown are in millisecond. I ran each implementation on two images: \u003Ca href=\"https://github.com/Const-me/SimdIntroArticle/blob/master/FloodFill/Images/maze-diagonal.png\">maze-diagonal.png\u003C/a>, 2212×2212 pixels, filled from the point x=885 y=128; and \u003Ca href=\"https://github.com/Const-me/SimdIntroArticle/blob/master/FloodFill/Images/shapes.png\">shapes.png\u003C/a>, 1024x1024 pixels, filled from the same point. Due to the nature of the problem, the time it takes to fill an image depends a lot on the image and other input parameters. 
For the first test, I’ve deliberately picked an image which is relatively hard to flood fill.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:image {\"id\":16230,\"sizeSlug\":\"large\"} -->\n\u003Cfigure class=\"wp-block-image size-large\">\u003Cimg src=\"https://stackoverflow.blog/wp-content/uploads/2020/07/image-2.png\" alt=\"\" class=\"wp-image-16230\"/>\u003C/figure>\n\u003C!-- /wp:image -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>As you can see from the table, vectorization improved performance by a factor of 1.9-3.5, depending on CPU, compiler, and the image. Both test images are in the repository, in the \u003Ca href=\"https://github.com/Const-me/SimdIntroArticle/tree/master/FloodFill/Images\">FloodFill/Images\u003C/a> subfolder.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:heading {\"level\":1} -->\n\u003Ch1>Conclusions\u003C/h1>\n\u003C!-- /wp:heading -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>The performance win is quite large in practice.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>The engineering overhead for vectorized code is not insignificant, especially for the flood fill, where the vectorized version has three to four times more code than the scalar scanline fill version. Admittedly, vectorized code is harder to read and debug; the&nbsp;difference fades with experience, but never disappears.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:heading -->\n\u003Ch2>Source Code\u003C/h2>\n\u003C!-- /wp:heading -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>I have posted the source code for these tests \u003Ca href=\"https://github.com/Const-me/SimdIntroArticle\">on GitHub\u003C/a>. It requires C++17, and I’ve tested on Windows 10 with Visual Studio 2017 and Ubuntu Linux 18 with gcc 7.4.0. The freeware Community edition of Visual Studio is fine. I have only tested 64-bit builds. 
The code is published under the copy/paste-friendly terms of the \u003Ca href=\"https://en.wikipedia.org/wiki/MIT_License\">MIT license\u003C/a>.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Because this article is targeted towards people unfamiliar with SIMD, I wrote more comments than I normally do, and I hope they help.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>Here are the commands I used to build the test projects on Linux:\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:preformatted -->\n\u003Cpre class=\"wp-block-preformatted\">mkdir build\ncd build\ncmake ../\nmake\u003C/pre>\n\u003C!-- /wp:preformatted -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>This blog post is about intrinsics, not C++17. The C++ parts are less than ideal; I’ve implemented the bare minimum required for the benchmarks. The flood fill project includes the \u003Ccode>stb_image\u003C/code> and \u003Ccode>stb_image_write\u003C/code> third-party libraries to handle PNG images: \u003Ca href=\"http://nothings.org/stb\">http://nothings.org/stb\u003C/a>. Again, this is not something I would probably do in production-quality C++ code. 
OS-provided image codecs are generally better, \u003Ca href=\"http://www.libpng.org/\">libpng\u003C/a> on Linux, or \u003Ca href=\"https://en.wikipedia.org/wiki/Windows_Imaging_Component\">WIC\u003C/a> on Windows.\u003C/p>\n\u003C!-- /wp:paragraph -->\n\n\u003C!-- wp:paragraph -->\n\u003Cp>I hope this gives you a sense of what’s possible when you tap into the power of SIMD intrinsics.&nbsp;\u003C/p>\n\u003C!-- /wp:paragraph -->","html","2020-07-08T14:00:00.000Z",{"current":1192},"improving-performance-with-simd-intrinsics-in-three-use-cases",[1194,1202,1207],{"_createdAt":1195,"_id":1196,"_rev":1197,"_type":1198,"_updatedAt":1195,"slug":1199,"title":1201},"2023-05-23T16:43:21Z","wp-tagcat-code-for-a-living","9HpbCsT2tq0xwozQfkc4ih","blogTag",{"current":1200},"code-for-a-living","Code for a Living",{"_createdAt":1195,"_id":1203,"_rev":1197,"_type":1198,"_updatedAt":1195,"slug":1204,"title":1206},"wp-tagcat-parallel-processing",{"current":1205},"parallel-processing","parallel processing",{"_createdAt":1195,"_id":1208,"_rev":1197,"_type":1198,"_updatedAt":1195,"slug":1209,"title":1210},"wp-tagcat-simd",{"current":1210},"simd","Improving performance with SIMD intrinsics in three use cases",[1213,1219,1225,1231],{"_id":1214,"publishedAt":1215,"slug":1216,"sponsored":12,"title":1218},"28e560af-f0aa-4d46-bd90-f435ad604aa7","2026-06-26T14:00:27.102Z",{"_type":10,"current":1217},"paging-charity-how-can-engineering-leaders-avoid-becoming-bond-villains","Paging Charity! How can engineering leaders avoid becoming Bond villains?",{"_id":1220,"publishedAt":1221,"slug":1222,"sponsored":12,"title":1224},"4b22c2a3-3779-4966-93eb-5230391dbdce","2026-06-23T14:08:58.595Z",{"_type":10,"current":1223},"your-ai-shipped-a-backend-that-boots-that-is-the-whole-problem","Your AI shipped a backend that boots. 
That is the whole problem.",{"_id":1226,"publishedAt":1227,"slug":1228,"sponsored":12,"title":1230},"5cf362e1-fe7b-45af-b69c-914731c6a052","2026-06-23T14:00:00.000Z",{"_type":10,"current":1229},"the-2026-developer-survey-is-now-open-for-human-developers-only","The 2026 Developer Survey is now open (for human developers only)!",{"_id":1232,"publishedAt":1233,"slug":1234,"sponsored":12,"title":1236},"30b995f7-7cb9-4dd8-bf71-d0685940a32b","2026-06-19T14:00:00.000Z",{"_type":10,"current":1235},"dispatches-from-o-reilly-from-capabilities-to-responsibilities","Dispatches from O'Reilly: From capabilities to responsibilities",{"data":1238,"sourceMap":-1},{"count":1239,"lastTimestamp":1240},25,"2023-05-25T09:47:22Z"]