Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
polynomial_arithmetic.cpp
Go to the documentation of this file.
1// === AUDIT STATUS ===
2// internal: { status: Planned, auditors: [], commit: }
3// external_1: { status: not started, auditors: [], commit: }
4// external_2: { status: not started, auditors: [], commit: }
5// =====================
6
13#include <math.h>
14#include <memory.h>
15#include <memory>
16
18
19namespace {
20
21template <typename Fr> std::shared_ptr<Fr[]> get_scratch_space(const size_t num_elements)
22{
23 static std::shared_ptr<Fr[]> working_memory = nullptr;
24 static size_t current_size = 0;
25 if (num_elements > current_size) {
26 working_memory = std::make_shared<Fr[]>(num_elements);
27 current_size = num_elements;
28 }
29 return working_memory;
30}
31
32} // namespace
33
34inline uint32_t reverse_bits(uint32_t x, uint32_t bit_length)
35{
36 x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
37 x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
38 x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
39 x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
40 return (((x >> 16) | (x << 16))) >> (32 - bit_length);
41}
42
43inline bool is_power_of_two(uint64_t x)
44{
45 return x && !(x & (x - 1));
46}
47
48template <typename Fr>
50 Fr* target,
51 const EvaluationDomain<Fr>& domain,
52 const Fr& generator_start,
53 const Fr& generator_shift,
54 const size_t generator_size)
55{
56 parallel_for(domain.num_threads, [&](size_t j) {
57 Fr thread_shift = generator_shift.pow(static_cast<uint64_t>(j * (generator_size / domain.num_threads)));
58 Fr work_generator = generator_start * thread_shift;
59 const size_t offset = j * (generator_size / domain.num_threads);
60 const size_t end = offset + (generator_size / domain.num_threads);
61 for (size_t i = offset; i < end; ++i) {
62 target[i] = coeffs[i] * work_generator;
63 work_generator *= generator_shift;
64 }
65 });
66}
67
68template <typename Fr>
69 requires SupportsFFT<Fr>
71 Fr* coeffs, Fr* target, const EvaluationDomain<Fr>& domain, const Fr&, const std::vector<Fr*>& root_table)
72{
73 parallel_for(domain.num_threads, [&](size_t j) {
74 Fr temp_1;
75 Fr temp_2;
76 for (size_t i = (j * domain.thread_size); i < ((j + 1) * domain.thread_size); i += 2) {
77 uint32_t next_index_1 = (uint32_t)reverse_bits((uint32_t)i + 2, (uint32_t)domain.log2_size);
78 uint32_t next_index_2 = (uint32_t)reverse_bits((uint32_t)i + 3, (uint32_t)domain.log2_size);
79 __builtin_prefetch(&coeffs[next_index_1]);
80 __builtin_prefetch(&coeffs[next_index_2]);
81
82 uint32_t swap_index_1 = (uint32_t)reverse_bits((uint32_t)i, (uint32_t)domain.log2_size);
83 uint32_t swap_index_2 = (uint32_t)reverse_bits((uint32_t)i + 1, (uint32_t)domain.log2_size);
84
85 Fr::__copy(coeffs[swap_index_1], temp_1);
86 Fr::__copy(coeffs[swap_index_2], temp_2);
87 target[i + 1] = temp_1 - temp_2;
88 target[i] = temp_1 + temp_2;
89 }
90 });
91
92 // outer FFT loop
93 for (size_t m = 2; m < (domain.size); m <<= 1) {
94 parallel_for(domain.num_threads, [&](size_t j) {
95 Fr temp;
96
97 // Ok! So, what's going on here? This is the inner loop of the FFT algorithm, and we want to break it
98 // out into multiple independent threads. For `num_threads`, each thread will evaluation `domain.size /
99 // num_threads` of the polynomial. The actual iteration length will be half of this, because we leverage
100 // the fact that \omega^{n/2} = -\omega (where \omega is a root of unity)
101
102 // Here, `start` and `end` are used as our iterator limits, so that we can use our iterator `i` to
103 // directly access the roots of unity lookup table
104 const size_t start = j * (domain.thread_size >> 1);
105 const size_t end = (j + 1) * (domain.thread_size >> 1);
106
107 // For all but the last round of our FFT, the roots of unity that we need, will be a subset of our
108 // lookup table. e.g. for a size 2^n FFT, the 2^n'th roots create a multiplicative subgroup of order 2^n
109 // the 1st round will use the roots from the multiplicative subgroup of order 2 : the 2'th roots of
110 // unity the 2nd round will use the roots from the multiplicative subgroup of order 4 : the 4'th
111 // roots of unity
112 // i.e. each successive FFT round will double the set of roots that we need to index.
113 // We have already laid out the `root_table` container so that each FFT round's roots are linearly
114 // ordered in memory. For all FFT rounds, the number of elements we're iterating over is greater than
115 // the size of our lookup table. We need to access this table in a cyclical fasion - i.e. for a subgroup
116 // of size x, the first x iterations will index the subgroup elements in order, then for the next x
117 // iterations, we loop back to the start.
118
119 // We could implement the algorithm by having 2 nested loops (where the inner loop iterates over the
120 // root table), but we want to flatten this out - as for the first few rounds, the inner loop will be
121 // tiny and we'll have quite a bit of unneccesary branch checks For each iteration of our flattened
122 // loop, indexed by `i`, the element of the root table we need to access will be `i % (current round
123 // subgroup size)` Given that each round subgroup size is `m`, which is a power of 2, we can index the
124 // root table with a very cheap `i & (m - 1)` Which is why we have this odd `block_mask` variable
125 const size_t block_mask = m - 1;
126
127 // The next problem to tackle, is we now need to efficiently index the polynomial element in
128 // `scratch_space` in our flattened loop If we used nested loops, the outer loop (e.g. `y`) iterates
129 // from 0 to 'domain size', in steps of 2 * m, with the inner loop (e.g. `z`) iterating from 0 to m. We
130 // have our inner loop indexer with `i & (m - 1)`. We need to add to this our outer loop indexer, which
131 // is equivalent to taking our indexer `i`, masking out the bits used in the 'inner loop', and doubling
132 // the result. i.e. polynomial indexer = (i & (m - 1)) + ((i & ~(m - 1)) >> 1) To simplify this, we
133 // cache index_mask = ~block_mask, meaning that our indexer is just `((i & index_mask) << 1 + (i &
134 // block_mask)`
135 const size_t index_mask = ~block_mask;
136
137 // `round_roots` fetches the pointer to this round's lookup table. We use `numeric::get_msb(m) - 1` as
138 // our indexer, because we don't store the precomputed root values for the 1st round (because they're
139 // all 1).
140 const Fr* round_roots = root_table[static_cast<size_t>(numeric::get_msb(m)) - 1];
141
142 // Finally, we want to treat the final round differently from the others,
143 // so that we can reduce out of our 'coarse' reduction and store the output in `coeffs` instead of
144 // `scratch_space`
145 for (size_t i = start; i < end; ++i) {
146 size_t k1 = (i & index_mask) << 1;
147 size_t j1 = i & block_mask;
148 temp = round_roots[j1] * target[k1 + j1 + m];
149 target[k1 + j1 + m] = target[k1 + j1] - temp;
150 target[k1 + j1] += temp;
151 }
152 });
153 }
154}
155
156template <typename Fr>
157 requires SupportsFFT<Fr>
158void ifft(Fr* coeffs, Fr* target, const EvaluationDomain<Fr>& domain)
159{
160 fft_inner_parallel(coeffs, target, domain, domain.root_inverse, domain.get_inverse_round_roots());
162 target[i] *= domain.domain_inverse;
164}
165
166template <typename Fr> Fr evaluate(const Fr* coeffs, const Fr& z, const size_t n)
167{
168 const size_t num_threads = get_num_cpus();
169 std::vector<Fr> evaluations(num_threads, Fr::zero());
170 parallel_for([&](const ThreadChunk& chunk) {
171 // parallel_for with ThreadChunk uses get_num_cpus() threads
172 BB_ASSERT_EQ(chunk.total_threads, evaluations.size());
173 auto range = chunk.range(n);
174 if (range.empty()) {
175 return;
176 }
177 size_t start = *range.begin();
178 Fr z_acc = z.pow(static_cast<uint64_t>(start));
179 for (size_t i : range) {
180 Fr work_var = z_acc * coeffs[i];
181 evaluations[chunk.thread_index] += work_var;
182 z_acc *= z;
183 }
184 });
185
186 Fr r = Fr::zero();
187 for (const auto& eval : evaluations) {
188 r += eval;
189 }
190 return r;
191}
192
193template <typename Fr> Fr evaluate(const std::vector<Fr*> coeffs, const Fr& z, const size_t large_n)
194{
195 const size_t num_polys = coeffs.size();
196 const size_t poly_size = large_n / num_polys;
197 BB_ASSERT(is_power_of_two(poly_size));
198 const size_t log2_poly_size = (size_t)numeric::get_msb(poly_size);
199 const size_t num_threads = get_num_cpus();
200 std::vector<Fr> evaluations(num_threads, Fr::zero());
201 parallel_for([&](const ThreadChunk& chunk) {
202 // parallel_for with ThreadChunk uses get_num_cpus() threads
203 BB_ASSERT_EQ(chunk.total_threads, evaluations.size());
204 auto range = chunk.range(large_n);
205 if (range.empty()) {
206 return;
207 }
208 size_t start = *range.begin();
209 Fr z_acc = z.pow(static_cast<uint64_t>(start));
210 for (size_t i : range) {
211 Fr work_var = z_acc * coeffs[i >> log2_poly_size][i & (poly_size - 1)];
212 evaluations[chunk.thread_index] += work_var;
213 z_acc *= z;
214 }
215 });
216
217 Fr r = Fr::zero();
218 for (const auto& eval : evaluations) {
219 r += eval;
220 }
221 return r;
222}
223
224// This function computes sum of all scalars in a given array.
225template <typename Fr> Fr compute_sum(const Fr* src, const size_t n)
226{
227 Fr result = 0;
228 for (size_t i = 0; i < n; ++i) {
229 result += src[i];
230 }
231 return result;
232}
233
234// This function computes the polynomial (x - a)(x - b)(x - c)... given n distinct roots (a, b, c, ...).
235template <typename Fr> void compute_linear_polynomial_product(const Fr* roots, Fr* dest, const size_t n)
236{
237
238 auto scratch_space_ptr = get_scratch_space<Fr>(n);
239 auto scratch_space = scratch_space_ptr.get();
240 memcpy((void*)scratch_space, (void*)roots, n * sizeof(Fr));
241
242 dest[n] = 1;
243 dest[n - 1] = -compute_sum(scratch_space, n);
244
245 Fr temp;
246 Fr constant = 1;
247 for (size_t i = 0; i < n - 1; ++i) {
248 temp = 0;
249 for (size_t j = 0; j < n - 1 - i; ++j) {
250 scratch_space[j] = roots[j] * compute_sum(&scratch_space[j + 1], n - 1 - i - j);
251 temp += scratch_space[j];
252 }
253 dest[n - 2 - i] = temp * constant;
254 constant *= Fr::neg_one();
255 }
256}
257
258template <typename Fr> Fr compute_linear_polynomial_product_evaluation(const Fr* roots, const Fr z, const size_t n)
259{
260 Fr result = 1;
261 for (size_t i = 0; i < n; ++i) {
262 result *= (z - roots[i]);
263 }
264 return result;
265}
266
267template <typename Fr>
268void compute_efficient_interpolation(const Fr* src, Fr* dest, const Fr* evaluation_points, const size_t n)
269{
270 /*
271 We use Lagrange technique to compute polynomial interpolation.
272 Given: (x_i, y_i) for i ∈ {0, 1, ..., n} =: [n]
273 Compute function f(X) such that f(x_i) = y_i for all i ∈ [n].
274 (X - x1)(X - x2)...(X - xn) (X - x0)(X - x2)...(X - xn)
275 F(X) = y0-------------------------------- + y1---------------------------------- + ...
276 (x0 - x_1)(x0 - x_2)...(x0 - xn) (x1 - x_0)(x1 - x_2)...(x1 - xn)
277 We write this as:
278 [ yi ]
279 F(X) = N(X) * |∑_i --------------- |
280 [ (X - xi) * di ]
281 where:
282 N(X) = ∏_{i \in [n]} (X - xi),
283 di = ∏_{j != i} (xi - xj)
284 For division of N(X) by (X - xi), we use the same trick that was used in compute_opening_polynomial()
285 function in the Kate commitment scheme.
286 We denote
287 q_{x_i} = N(X)/(X-x_i) * y_i * (d_i)^{-1} = q_{x_i,0}*1 + ... + q_{x_i,n-1} * X^{n-1} for i=0,..., n-1.
288
289 The computation of F(X) is split into two cases:
290
291 - if 0 is not in the interpolation domain, then the numerator polynomial N(X) has a non-zero constant term
292 that is used to initialize the division algorithm mentioned above; the monomial coefficients q_{x_i, j} of
293 q_{x_i} are accumulated into dest[j] for j=0,..., n-1
294
295 - if 0 is in the domain at index i_0, the constant term of N(X) is 0 and the division algorithm computing
296 q_{x_i} for i != i_0 is initialized with the constant term of N(X)/X. Note that its coefficients are given
297 by numerator_polynomial[j] for j=1,...,n. The monomial coefficients of q_{x_i} are then accumuluated in
298 dest[j] for j=1,..., n-1. Whereas the coefficients of
299 q_{0} = N(X)/X * f(0) * (d_{i_0})^{-1}
300 are added to dest[j] for j=0,..., n-1. Note that these coefficients do not require performing the division
301 algorithm used in Kate commitment scheme, as the coefficients of N(X)/X are given by numerator_polynomial[j]
302 for j=1,...,n.
303 */
304 std::vector<Fr> numerator_polynomial(n + 1);
305 polynomial_arithmetic::compute_linear_polynomial_product(evaluation_points, numerator_polynomial.data(), n);
306 // First half contains roots, second half contains denominators (to be inverted)
307 std::vector<Fr> roots_and_denominators(2 * n);
308 std::vector<Fr> temp_src(n);
309 for (size_t i = 0; i < n; ++i) {
310 roots_and_denominators[i] = -evaluation_points[i];
311 temp_src[i] = src[i];
312 dest[i] = 0;
313 // compute constant denominators
314 roots_and_denominators[n + i] = 1;
315 for (size_t j = 0; j < n; ++j) {
316 if (j == i) {
317 continue;
318 }
319 roots_and_denominators[n + i] *= (evaluation_points[i] - evaluation_points[j]);
320 }
321 }
322 // at this point roots_and_denominators is populated as follows
323 // (x_0,\ldots, x_{n-1}, d_0, \ldots, d_{n-1})
324 Fr::batch_invert(roots_and_denominators.data(), 2 * n);
325
326 Fr z, multiplier;
327 std::vector<Fr> temp_dest(n);
328 size_t idx_zero = 0;
329 bool interpolation_domain_contains_zero = false;
330 // if the constant term of the numerator polynomial N(X) is 0, then the interpolation domain contains 0
331 // we find the index i_0, such that x_{i_0} = 0
332 if (numerator_polynomial[0] == Fr(0)) {
333 for (size_t i = 0; i < n; ++i) {
334 if (evaluation_points[i] == Fr(0)) {
335 idx_zero = i;
336 interpolation_domain_contains_zero = true;
337 break;
338 }
339 }
340 };
341
342 if (!interpolation_domain_contains_zero) {
343 for (size_t i = 0; i < n; ++i) {
344 // set z = - 1/x_i for x_i <> 0
345 z = roots_and_denominators[i];
346 // temp_src[i] is y_i, it gets multiplied by 1/d_i
347 multiplier = temp_src[i] * roots_and_denominators[n + i];
348 temp_dest[0] = multiplier * numerator_polynomial[0];
349 temp_dest[0] *= z;
350 dest[0] += temp_dest[0];
351 for (size_t j = 1; j < n; ++j) {
352 temp_dest[j] = multiplier * numerator_polynomial[j] - temp_dest[j - 1];
353 temp_dest[j] *= z;
354 dest[j] += temp_dest[j];
355 }
356 }
357 } else {
358 for (size_t i = 0; i < n; ++i) {
359 if (i == idx_zero) {
360 // the contribution from the term corresponding to i_0 is computed separately
361 continue;
362 }
363 // get the next inverted root
364 z = roots_and_denominators[i];
365 // compute f(x_i) * d_{x_i}^{-1}
366 multiplier = temp_src[i] * roots_and_denominators[n + i];
367 // get x_i^{-1} * f(x_i) * d_{x_i}^{-1} into the "free" term
368 temp_dest[1] = multiplier * numerator_polynomial[1];
369 temp_dest[1] *= z;
370 // correct the first coefficient as it is now accumulating free terms from
371 // f(x_i) d_i^{-1} prod_(X-x_i, x_i != 0) (X-x_i) * 1/(X-x_i)
372 dest[1] += temp_dest[1];
373 // compute the quotient N(X)/(X-x_i) f(x_i)/d_{x_i} and its contribution to the target coefficients
374 for (size_t j = 2; j < n; ++j) {
375 temp_dest[j] = multiplier * numerator_polynomial[j] - temp_dest[j - 1];
376 temp_dest[j] *= z;
377 dest[j] += temp_dest[j];
378 };
379 }
380 // correct the target coefficients by the contribution from q_{0} = N(X)/X * d_{i_0}^{-1} * f(0)
381 for (size_t i = 0; i < n; ++i) {
382 dest[i] += temp_src[idx_zero] * roots_and_denominators[n + idx_zero] * numerator_polynomial[i + 1];
383 }
384 }
385}
386
387template fr evaluate<fr>(const fr*, const fr&, const size_t);
388template fr evaluate<fr>(const std::vector<fr*>, const fr&, const size_t);
389template void fft_inner_parallel<fr>(fr*, fr*, const EvaluationDomain<fr>&, const fr&, const std::vector<fr*>&);
390template void ifft<fr>(fr*, fr*, const EvaluationDomain<fr>&);
391template fr compute_sum<fr>(const fr*, const size_t);
392template void compute_linear_polynomial_product<fr>(const fr*, fr*, const size_t);
393template void compute_efficient_interpolation<fr>(const fr*, fr*, const fr*, const size_t);
394
395template grumpkin::fr evaluate<grumpkin::fr>(const grumpkin::fr*, const grumpkin::fr&, const size_t);
396template grumpkin::fr evaluate<grumpkin::fr>(const std::vector<grumpkin::fr*>, const grumpkin::fr&, const size_t);
397template grumpkin::fr compute_sum<grumpkin::fr>(const grumpkin::fr*, const size_t);
398template void compute_linear_polynomial_product<grumpkin::fr>(const grumpkin::fr*, grumpkin::fr*, const size_t);
399template void compute_efficient_interpolation<grumpkin::fr>(const grumpkin::fr*,
401 const grumpkin::fr*,
402 const size_t);
403
404} // namespace bb::polynomial_arithmetic
#define BB_ASSERT(expression,...)
Definition assert.hpp:70
#define BB_ASSERT_EQ(actual, expected,...)
Definition assert.hpp:83
const std::vector< FF * > & get_inverse_round_roots() const
#define ITERATE_OVER_DOMAIN_START(domain)
#define ITERATE_OVER_DOMAIN_END
constexpr bool is_power_of_two(uint64_t x)
Definition pow.hpp:35
Fr compute_linear_polynomial_product_evaluation(const Fr *roots, const Fr z, const size_t n)
uint32_t reverse_bits(uint32_t x, uint32_t bit_length)
void ifft(Fr *coeffs, Fr *target, const EvaluationDomain< Fr > &domain)
void compute_linear_polynomial_product(const Fr *roots, Fr *dest, const size_t n)
Fr evaluate(const Fr *coeffs, const Fr &z, const size_t n)
void fft_inner_parallel(Fr *coeffs, Fr *target, const EvaluationDomain< Fr > &domain, const Fr &, const std::vector< Fr * > &root_table)
void compute_efficient_interpolation(const Fr *src, Fr *dest, const Fr *evaluation_points, const size_t n)
void scale_by_generator(Fr *coeffs, Fr *target, const EvaluationDomain< Fr > &domain, const Fr &generator_start, const Fr &generator_shift, const size_t generator_size)
Fr compute_sum(const Fr *src, const size_t n)
size_t get_num_cpus()
Definition thread.cpp:33
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
Definition thread.cpp:111
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
Curve::ScalarField Fr
size_t total_threads
Definition thread.hpp:151
size_t thread_index
Definition thread.hpp:150
auto range(size_t size, size_t offset=0) const
Definition thread.hpp:152
static constexpr field neg_one()
BB_INLINE constexpr field pow(const uint256_t &exponent) const noexcept
static void batch_invert(C &coeffs) noexcept
Batch invert a collection of field elements using Montgomery's trick.
static constexpr field zero()