CARVIEW |
Select Language
HTTP/2 200
date: Wed, 23 Jul 2025 08:58:27 GMT
content-type: text/html; charset=utf-8
vary: X-PJAX, X-PJAX-Container, Turbo-Visit, Turbo-Frame, X-Requested-With,Accept-Encoding, Accept, X-Requested-With
etag: W/"e5e22177fe3f0ac02c558ff0e801e0a3"
cache-control: max-age=0, private, must-revalidate
strict-transport-security: max-age=31536000; includeSubdomains; preload
x-frame-options: deny
x-content-type-options: nosniff
x-xss-protection: 0
referrer-policy: origin-when-cross-origin, strict-origin-when-cross-origin
content-security-policy: default-src 'none'; base-uri 'self'; child-src github.githubassets.com github.com/assets-cdn/worker/ github.com/assets/ gist.github.com/assets-cdn/worker/; connect-src 'self' uploads.github.com www.githubstatus.com collector.github.com raw.githubusercontent.com api.github.com github-cloud.s3.amazonaws.com github-production-repository-file-5c1aeb.s3.amazonaws.com github-production-upload-manifest-file-7fdce7.s3.amazonaws.com github-production-user-asset-6210df.s3.amazonaws.com *.rel.tunnels.api.visualstudio.com wss://*.rel.tunnels.api.visualstudio.com objects-origin.githubusercontent.com copilot-proxy.githubusercontent.com proxy.individual.githubcopilot.com proxy.business.githubcopilot.com proxy.enterprise.githubcopilot.com *.actions.githubusercontent.com wss://*.actions.githubusercontent.com productionresultssa0.blob.core.windows.net/ productionresultssa1.blob.core.windows.net/ productionresultssa2.blob.core.windows.net/ productionresultssa3.blob.core.windows.net/ productionresultssa4.blob.core.windows.net/ productionresultssa5.blob.core.windows.net/ productionresultssa6.blob.core.windows.net/ productionresultssa7.blob.core.windows.net/ productionresultssa8.blob.core.windows.net/ productionresultssa9.blob.core.windows.net/ productionresultssa10.blob.core.windows.net/ productionresultssa11.blob.core.windows.net/ productionresultssa12.blob.core.windows.net/ productionresultssa13.blob.core.windows.net/ productionresultssa14.blob.core.windows.net/ productionresultssa15.blob.core.windows.net/ productionresultssa16.blob.core.windows.net/ productionresultssa17.blob.core.windows.net/ productionresultssa18.blob.core.windows.net/ productionresultssa19.blob.core.windows.net/ github-production-repository-image-32fea6.s3.amazonaws.com github-production-release-asset-2e65be.s3.amazonaws.com insights.github.com wss://alive.github.com api.githubcopilot.com api.individual.githubcopilot.com api.business.githubcopilot.com api.enterprise.githubcopilot.com; font-src github.githubassets.com; form-action 'self' github.com gist.github.com copilot-workspace.githubnext.com objects-origin.githubusercontent.com; frame-ancestors 'none'; frame-src viewscreen.githubusercontent.com notebooks.githubusercontent.com; img-src 'self' data: blob: github.githubassets.com media.githubusercontent.com camo.githubusercontent.com identicons.github.com avatars.githubusercontent.com private-avatars.githubusercontent.com github-cloud.s3.amazonaws.com objects.githubusercontent.com release-assets.githubusercontent.com secured-user-images.githubusercontent.com/ user-images.githubusercontent.com/ private-user-images.githubusercontent.com opengraph.githubassets.com copilotprodattachments.blob.core.windows.net/github-production-copilot-attachments/ github-production-user-asset-6210df.s3.amazonaws.com customer-stories-feed.github.com spotlights-feed.github.com objects-origin.githubusercontent.com *.githubusercontent.com; manifest-src 'self'; media-src github.com user-images.githubusercontent.com/ secured-user-images.githubusercontent.com/ private-user-images.githubusercontent.com github-production-user-asset-6210df.s3.amazonaws.com gist.github.com; script-src github.githubassets.com; style-src 'unsafe-inline' github.githubassets.com; upgrade-insecure-requests; worker-src github.githubassets.com github.com/assets-cdn/worker/ github.com/assets/ gist.github.com/assets-cdn/worker/
server: github.com
content-encoding: gzip
accept-ranges: bytes
set-cookie: _gh_sess=Vvsnb5cTTH8CSUtMkDNCxkG48LKoCkKEgvlK%2FcPlEjrdO2Lr0SI3ZIHH506TG4Wj%2F7eJSIy1j5%2FtkZsNUw8WlOJk5MAhXiBMUqtYiVcphWg6bU9WoJ6QU0d6%2B%2FsRToeEHjQR80%2FZkEPQ5hmh0wTqYKDS1XZeP%2FNmecWm2OpXf39eiClp0RAwVlXkICdba3kJ4DRrxee9ArkIS%2Bwfp0MPAPmWAkjYZiaozvk%2FBkv8%2Bg%2BsvCDmobJ5yGftTsVfgONkZq2xDEtce764ha7Tquy8jg%3D%3D--pd9XOJhaTycNZuJ2--%2Foj1e%2BCW3nGL%2F9WiJ%2F%2BQgQ%3D%3D; Path=/; HttpOnly; Secure; SameSite=Lax
set-cookie: _octo=GH1.1.884930863.1753261107; Path=/; Domain=github.com; Expires=Thu, 23 Jul 2026 08:58:27 GMT; Secure; SameSite=Lax
set-cookie: logged_in=no; Path=/; Domain=github.com; Expires=Thu, 23 Jul 2026 08:58:27 GMT; HttpOnly; Secure; SameSite=Lax
x-github-request-id: B26A:1D735C:7A0135:967B67:6880A433
crc32.cpp · GitHub
Show Gist options
Save dougallj/263f132023f590aec31a11bbc746b897 to your computer and use it in GitHub Desktop.
{{ message }}
Instantly share code, notes, and snippets.
-
Star
0
(0)
You must be signed in to star a gist -
Fork
0
(0)
You must be signed in to fork a gist
-
Save dougallj/263f132023f590aec31a11bbc746b897 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Demo for "Faster CRC32 on the Apple M1" | |
// https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ | |
// | |
// I rarely use C++, but I had some fun using constexpr functions to | |
// compute the various constants, so you can just specify the polynomial | |
// as a template parameter. | |
// | |
// THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED | |
// WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF | |
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. | |
#include <array> | |
#include <cassert> | |
#include <cstddef> | |
#include <cstdint> | |
#include <type_traits> | |
#include <utility> | |
#include <arm_neon.h> | |
#ifndef NO_CRC_INSNS | |
#include <arm_acle.h> | |
#endif | |
// How many independent latency chains to use | |
inline constexpr int NUM_CHAINS = 12; | |
static_assert(NUM_CHAINS > 0); // required for correctness | |
static_assert(NUM_CHAINS < 1000); // what is this, a gpu? | |
// CRC polynomial, reflected, with implicit x^32 | |
inline constexpr uint64_t CRC32_POLY = 0xedb88320; // CRC-32 | |
inline constexpr uint64_t CRC32C_POLY = 0x82f63b78; // CRC-32C | |
template <uint32_t Poly, int NumChains> | |
uint32_t generic_crc32(uint32_t crc, uint8_t *p, size_t size); | |
uint32_t crc32(uint32_t crc, uint8_t *p, size_t size) { | |
return generic_crc32<CRC32_POLY, NUM_CHAINS>(crc, p, size); | |
} | |
uint32_t crc32c(uint32_t crc, uint8_t *p, size_t size) { | |
return generic_crc32<CRC32C_POLY, NUM_CHAINS>(crc, p, size); | |
} | |
template <uint32_t Poly> | |
constexpr uint32_t slow_crc32b(uint32_t v, uint64_t d) { | |
v ^= d; | |
for (int i = 0; i < 8; i++) | |
v = (v >> 1) ^ (v & 1 ? Poly : 0); | |
return v; | |
} | |
template <uint32_t Poly> constexpr auto build_crc32_table() { | |
std::array<uint32_t, 256> arr{}; | |
for (int i = 0; i < 256; i++) | |
arr[i] = slow_crc32b<Poly>(0, i); | |
return arr; | |
} | |
template <uint32_t Poly> | |
inline constexpr auto crc32_table = build_crc32_table<Poly>(); | |
template <uint32_t Poly> | |
__attribute__((always_inline)) static uint32_t crc32b(uint32_t v, uint8_t d) { | |
#ifndef NO_CRC_INSNS | |
if constexpr (Poly == CRC32_POLY) | |
return __crc32b(v, d); | |
else if constexpr (Poly == CRC32C_POLY) | |
return __crc32cb(v, d); | |
else | |
#endif | |
return crc32_table<Poly>[d ^ (v & 0xFF)] ^ (v >> 8); | |
} | |
template <uint32_t Poly> | |
__attribute__((always_inline)) static uint32_t crc32d(uint32_t v, uint64_t d) { | |
#ifndef NO_CRC_INSNS | |
if constexpr (Poly == CRC32_POLY) | |
return __crc32d(v, d); | |
else if constexpr (Poly == CRC32C_POLY) | |
return __crc32cd(v, d); | |
else | |
#endif | |
{ | |
for (int i = 0; i < 8; i++) | |
v = crc32b<Poly>(v, (d >> (i * 8)) & 0xFF); | |
return v; | |
} | |
} | |
template <uint32_t Poly> inline constexpr uint32_t x_to_n_mod_p(int n) { | |
uint32_t r = (uint32_t)1 << 31; | |
for (int i = 0; i < n; i++) { | |
r = (r >> 1) ^ (r & 1 ? Poly : 0); | |
} | |
return r; | |
} | |
template <uint32_t Poly> inline constexpr uint64_t x_to_n_div_p(int n) { | |
uint32_t r = (uint32_t)1 << 31; | |
uint64_t q = 0; | |
for (int i = 0; i < n; i++) { | |
q |= (uint64_t)(r & 1) << i; | |
r = (r >> 1) ^ (r & 1 ? Poly : 0); | |
} | |
return q; | |
} | |
template <uint32_t Poly> inline constexpr uint64_t k_shift(int n) { | |
return (uint64_t)x_to_n_mod_p<Poly>(n) << 1; | |
} | |
__attribute__((always_inline)) static uint8x16_t | |
reduce(uint8x16_t a, uint8x16_t b, poly64x2_t k) { | |
asm("pmull.1q v0, %[val], %[consts] \n\t" | |
"eor.16b v0, v0, %[data] \n\t" | |
"pmull2.1q %[val], %[val], %[consts] \n\t" | |
"eor.16b %[val], %[val], v0 \n\t" | |
: [val] "+w"(a) | |
: [consts] "w"(k), [data] "w"(b) | |
: "v0"); | |
return a; | |
} | |
template <uint32_t Poly, int NumChains> | |
uint32_t generic_crc32(uint32_t crc, uint8_t *p, size_t size) { | |
uint8_t *end = p + size; | |
// align | |
uint32_t result = ~crc; | |
while (p != end && ((uintptr_t)p & 15) != 0) { | |
result = crc32b<Poly>(result, *p++); | |
} | |
// NOTE: really not optimised for small sizes, but at least this seems | |
// to make it work. | |
size = (uintptr_t)end - (uintptr_t)p; | |
if (end - p < 16 * NumChains) { | |
while (size > 8) { | |
uint64_t word; | |
memcpy(&word, p, sizeof word); | |
result = crc32d<Poly>(result, word); | |
p += 8; | |
size -= 8; | |
} | |
while (size > 0) { | |
result = crc32b<Poly>(result, *p); | |
p++; | |
size--; | |
} | |
return ~result; | |
} | |
// load first 16 * NumChains chunk | |
uint8x16_t vals[NumChains]; | |
for (int i = 0; i < NumChains; i++) { | |
vals[i] = vld1q_u8(p); | |
p += 0x10; | |
} | |
// fold in the initial crc value | |
uint32x4_t init = {result, 0, 0, 0}; | |
vals[0] = veorq_u8(vals[0], init); | |
// fold in 16 * NumChains bytes at a time | |
constexpr poly64x2_t k1k2 = {k_shift<Poly>(NumChains * 128 + 32), | |
k_shift<Poly>(NumChains * 128 - 32)}; | |
size = (uintptr_t)end - (uintptr_t)p; | |
size_t fast_size = size / (16 * NumChains) * (16 * NumChains); | |
uint8_t *fast_end = p + fast_size; | |
while (p != fast_end) { | |
#pragma unroll | |
for (int i = 0; i < NumChains; i++) { | |
vals[i] = reduce(vals[i], vld1q_u8(p), k1k2); | |
p += 0x10; | |
} | |
} | |
// fold down to 16-bytes | |
constexpr poly64x2_t k3k4 = {k_shift<Poly>(128 + 32), | |
k_shift<Poly>(128 - 32)}; | |
// TODO: use a less serial reduction since this has ~72c latency | |
uint64x2_t x = vals[0]; | |
for (int i = 1; i < NumChains; i++) | |
x = reduce(x, vals[i], k3k4); | |
// fold remaining 16-bytes chunks | |
size = (uintptr_t)end - (uintptr_t)p; | |
while (size >= 16) { | |
x = reduce(x, vld1q_u8(p), k3k4); | |
p += 0x10; | |
size -= 0x10; | |
} | |
uint64x2_t message128 = x; | |
#ifndef NO_CRC_INSNS | |
// This is simpler and faster if we can use the CRC32 instructions | |
if constexpr (Poly == CRC32_POLY || Poly == CRC32C_POLY) { | |
result = crc32d<Poly>(0, x[0]); | |
result = crc32d<Poly>(result, x[1]); | |
} else | |
#endif | |
{ | |
// Barrett reduction technique | |
// Explanation may be incorrect, just records my current best understanding. | |
// CRC32(M(x)) = x^32 • M(x) mod P(x) | |
// | |
// Our message128 register contains M(x), so | |
// | |
// x^32 • M(x) = | |
// | |
// [------------------------- message128 -------------------------][- zeros (32) -] | |
// | |
// Do two more folds to reduce this to 64-bits, so we can use the 64-bit | |
// PMULL instructions to perform the Barrett reduction. | |
// First fold down to 96-bit: | |
// [-------- message128[0] -------][-------- message128[1] -------][- zeros (32) -] | |
// | xor | |
// '----- pmull ----> [-------------------- (96) --------------------] | |
// = | |
// [------------------ message96 -----------------] | |
constexpr poly64x2_t k96k64 = {k_shift<Poly>(96), k_shift<Poly>(64)}; | |
uint32x4_t message96 = | |
veorq_u8(uint64x2_t{message128[1], 0}, | |
vreinterpretq_u8_p128(vmull_p64(message128[0], k96k64[0]))); | |
// Then fold down to 64-bit: | |
// [--------------- message96 (96) ---------------] | |
// [ message96[0] ][ message96[1] ][ message96[2] ] | |
// | xor | |
// '-----> [------- (64-bit result) ------] | |
// = | |
// [---------- message64 ---------] | |
uint32x4_t rotated = vextq_u32(message96, message96, 1); | |
message96[2] = message96[0]; | |
uint64x2_t message64 = veorq_u8( | |
rotated, vreinterpretq_u8_p128(vmull_high_p64(message96, k96k64))); | |
// message64 = x^32 • M(x), so now we just need to compute message64 mod | |
// P(x). | |
// | |
// The Barrett reduction, for unsigned integers, using C-like (flooring) | |
// division is: | |
// | |
// x % N = x - (x / N) * N | |
// | |
// As N is constant, this can use reciprocal division instead. The same | |
// general idea applies to our polynomials: | |
constexpr poly64x2_t u = {x_to_n_div_p<Poly>(64) >> 31, 0}; | |
uint32x4_t t1 = vreinterpretq_u8_p128(vmull_p64(message64[0], u[0])); | |
t1[1] = 0; | |
constexpr poly64x2_t poly = {((uint64_t)Poly << 1) | 1, 0}; | |
uint32x4_t tmp = veorq_u8( | |
vreinterpretq_u8_p128(vmull_p64(vreinterpretq_p64_u8(t1)[0], poly[0])), | |
message64); | |
result = tmp[1]; | |
} | |
while (p != end) | |
result = crc32b<Poly>(result, *p++); | |
return ~result; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You can’t perform that action at this time.