aoc/year2018/day14.rs
1//! # Chocolate Charts
2//!
3//! This solution is heavily inspired by [Askalski's](https://www.reddit.com/user/askalski/)
4//! excellent post [Breaking the 1 billion recipes per second barrier](https://www.reddit.com/r/adventofcode/comments/a6wpwa/2018_day_14_breaking_the_1_billion_recipes_per/)
5//!
6//! The key insight is that after 23 recipes the elves converge into using the *same subset* of
7//! recipes. This subset can be stored compactly in about 20% of the space and read sequentially
8//! to allow efficient vector processing.
9//!
10//! Tricks used to speed things up:
11//! * Separate writer and reader threads to generate recipes and check them in parallel.
12//! * Vector processing of recipes using techniques similar to SIMD.
13use crate::util::parse::*;
14use std::sync::atomic::{AtomicBool, Ordering};
15use std::sync::mpsc::{Receiver, Sender, channel};
16use std::thread;
17
18type Input = (String, usize);
19
20/// Pre-calculate the first 23 recipes.
21const PREFIX: [u8; 23] = [3, 7, 1, 0, 1, 0, 1, 2, 4, 5, 1, 5, 8, 9, 1, 6, 7, 7, 9, 2, 5, 1, 0];
22
23pub fn parse(input: &str) -> Input {
24 // Send batches of recipes from the writer to the reader for checking.
25 let (tx, rx) = channel();
26 // Thread safe flag to let writer know when to stop.
27 let done = AtomicBool::new(false);
28 // Store recipes in fixed size vec prefilled with ones. Part two result is around 20 million
29 // so size should be sufficient for most inputs.
30 let mut recipes = vec![1; 25_000_000];
31
32 thread::scope(|scope| {
33 // Start writer thread to produce new recipes.
34 scope.spawn(|| writer(tx, &done, recipes.as_mut_slice()));
35 // Reader thread checks recipes for the answers, returning when both parts are found.
36 scope.spawn(|| reader(rx, &done, input)).join().unwrap()
37 })
38}
39
40pub fn part1(input: &Input) -> &str {
41 &input.0
42}
43
44pub fn part2(input: &Input) -> usize {
45 input.1
46}
47
48/// Receives batches of recipes from the writer thread, then scans them byte by byte searching
49/// for the part two pattern. For simplicity the pattern is always assumed to by six digits.
50fn reader(rx: Receiver<&[u8]>, done: &AtomicBool, input: &str) -> (String, usize) {
51 let part_one_target = input.unsigned::<usize>() + 10;
52 let part_two_target = u32::from_str_radix(input.trim(), 16).unwrap();
53
54 let mut part_one_result = None;
55 let mut part_two_result = None;
56
57 let mut history = Vec::new();
58 let mut total = 0;
59 let mut pattern = 0;
60
61 for slice in rx {
62 history.push(slice);
63 total += slice.len();
64
65 // The recipes are broken up into batches. Even though these batches originally come
66 // from the same contiguous slice, this thread has no way to know that or reassemble
67 // the original. The result could potentially be split over two or more slices.
68 if part_one_result.is_none() && total >= part_one_target {
69 let mut index = 0;
70 let mut offset = part_one_target - 10;
71 let mut result = String::new();
72
73 for _ in 0..10 {
74 // If we go past the end of a slice then check the next one.
75 while offset >= history[index].len() {
76 offset -= history[index].len();
77 index += 1;
78 }
79
80 // Push each digit into a string as there could be leading zeroes.
81 let digit = history[index][offset];
82 result.push((digit + b'0') as char);
83 offset += 1;
84 }
85
86 part_one_result = Some(result);
87 }
88
89 // Simple brute force pattern matching. Slices are received in order so the pattern will
90 // handle cases when the target is split between two slices.
91 if part_two_result.is_none() {
92 for (i, n) in slice.iter().copied().enumerate() {
93 pattern = ((pattern << 4) | (n as u32)) & 0xffffff;
94
95 if pattern == part_two_target {
96 part_two_result = Some(total - slice.len() + i - 5);
97 break;
98 }
99 }
100 }
101
102 // Signal the writer thread to finish once both results are found.
103 if part_one_result.is_some() && part_two_result.is_some() {
104 done.store(true, Ordering::Relaxed);
105 break;
106 }
107 }
108
109 (part_one_result.unwrap(), part_two_result.unwrap())
110}
111
112/// Generates recipes then sends them to the reader thread for checking in batches.
113/// Processing is broken into alternating "cold" and "hot" loops. An outer enclosing loop checks
114/// periodically for the done signal from the reader thread.
115///
116/// The "cold" loop processes recipes serially one by one but can handle input corner cases.
117/// It's used when either:
118/// * One or both elves are within the first 23 recipes.
119/// * One or both elves are within the last 16 recipes.
120///
121/// The "hot" loop processes recipes efficiently in chunks of 16. The vast majority of recipes
122/// are calculated in this loop. As much as possible is parallelized using techniques similar to
123/// SIMD but using regular instructions instead of SIMD instrinsics or Rust's portable SIMD API.
124///
125/// Interestingly on an Apple M2 Max this "poor man's SIMD" has the same performance as using
126/// the portable SIMD API. This is probably due to the fact that the serial loops that write new
127/// recipes take the majority of the time.
128fn writer<'a>(tx: Sender<&'a [u8]>, done: &AtomicBool, mut recipes: &'a mut [u8]) {
129 // The first 23 recipes have already been generated
130 // so the elves start at position 0 and 8 respectively.
131 let mut elf1 = 0;
132 let mut index1 = 0;
133
134 let mut elf2 = 8;
135 let mut index2 = 0;
136
137 let mut base = 0;
138 let mut size = 23;
139 let mut needed = 23;
140
141 // Store the smaller subset of recipes used by the elves.
142 let mut write = 0;
143 let mut snack: Vec<u8> = vec![0; 5_000_000];
144
145 while !done.load(Ordering::Relaxed) {
146 // Cold loop to handle start and end transitions.
147 while elf1 < 23 || elf2 < 23 || write - index1.max(index2) <= 16 {
148 // After the first 23 recipes both elves converge on the same set of ingredients.
149 let recipe1 = if elf1 < 23 {
150 PREFIX[elf1]
151 } else {
152 index1 += 1;
153 snack[index1 - 1]
154 };
155
156 let recipe2 = if elf2 < 23 {
157 PREFIX[elf2]
158 } else {
159 index2 += 1;
160 snack[index2 - 1]
161 };
162
163 // Add next recipe.
164 let next = recipe1 + recipe2;
165 if next < 10 {
166 recipes[size - base] = next;
167 size += 1;
168 } else {
169 recipes[size - base + 1] = next - 10;
170 size += 2;
171 }
172
173 if needed < size {
174 let digit = recipes[needed - base];
175 needed += 1 + digit as usize;
176
177 snack[write] = digit;
178 write += 1;
179 }
180
181 // Wrap around to start if necessary.
182 elf1 += 1 + recipe1 as usize;
183 if elf1 >= size {
184 elf1 -= size;
185 index1 = 0;
186 }
187
188 elf2 += 1 + recipe2 as usize;
189 if elf2 >= size {
190 elf2 -= size;
191 index2 = 0;
192 }
193 }
194
195 // Hot loop to handle the majority of recipes in the middle. Process at most 10,000 recipes
196 // at a time in order to produce batches between 160,000 and 320,000 bytes in size.
197 // This size is roughly tuned in order to maximize reader thread throughput.
198 let batch_size = 10_000.min((write - index1.max(index2) - 1) / 16);
199
200 for _ in 0..batch_size {
201 // Snacks can be processed sequentially.
202 let first = from_be_bytes(&snack, index1);
203 let second = from_be_bytes(&snack, index2);
204 let third = from_be_bytes(&snack, index1 + 8);
205 let fourth = from_be_bytes(&snack, index2 + 8);
206
207 // Each elf will skip forward between 16 and 32 snacks.
208 elf1 += 16 + lsb(prefix_sum(first)) + lsb(prefix_sum(third));
209 elf2 += 16 + lsb(prefix_sum(second)) + lsb(prefix_sum(fourth));
210 index1 += 16;
211 index2 += 16;
212
213 // Process the digits in parallel using techniques similar to SIMD.
214 let (digits1, indices1, extra1) = unpack(first, second);
215 let (digits2, indices2, extra2) = unpack(third, fourth);
216
217 // Scatter each digit into the correct location, leaving "holes" where ones should go.
218 // This is handled correctly by prefilling `recipes`` with ones when initializing.
219 for shift in (0..64).step_by(8) {
220 let digit = lsb(digits1 >> shift);
221 let index = lsb(indices1 >> shift);
222 recipes[size - base + index] = digit as u8;
223
224 let digit = lsb(digits2 >> shift);
225 let index = lsb(indices2 >> shift);
226 recipes[size - base + index + extra1] = digit as u8;
227 }
228
229 size += extra1 + extra2;
230
231 // Write the recipes that will actually be used in subsequent loops to a smaller
232 // contiguous vec.
233 while needed < size {
234 let digit = recipes[needed - base];
235 needed += 1 + digit as usize;
236
237 snack[write] = digit;
238 write += 1;
239 }
240 }
241
242 // Split the mutable `recipes` slice into two parts. This allows the reader thread to
243 // access the head in parallel while the reader thread continues to write to the tail,
244 // ensuring unique ownership of each part of memory to prevent any concurrency issues.
245 let (head, tail) = recipes.split_at_mut(size - base);
246 let _unused = tx.send(head);
247 recipes = tail;
248 base = size;
249 }
250
251 // Drop the sender to make the receiver hang up.
252 drop(tx);
253}
254
255/// Convert 8 bytes in [big endian order](https://en.wikipedia.org/wiki/Endianness) into a `usize`.
256#[inline]
257fn from_be_bytes(slice: &[u8], index: usize) -> usize {
258 usize::from_be_bytes(slice[index..index + 8].try_into().unwrap())
259}
260
261/// Convenience function that returns least significant byte.
262#[inline]
263fn lsb(u: usize) -> usize {
264 u & 0xff
265}
266
267/// Compute the prefix sum of each byte within a `usize`. Let `a..h` denote the bytes from most
268/// significant to least significant and `Σx..y` denote the sum from `x` to `y` inclusive.
269///
270/// ```none
271/// s | a | b | c | d | e | f | g | h |
272/// s += (s >> 8) | a | Σa..b | Σb..c | Σc..d | Σd..e | Σe..f | Σf..g | Σg..h |
273/// s += (s >> 16) | a | Σa..b | Σa..c | Σa..d | Σb..e | Σc..f | Σd..g | Σe..h |
274/// s += (s >> 32) | a | Σa..b | Σa..c | Σa..d | Σa..e | Σa..f | Σa..g | Σa..h |
275/// ```
276#[inline]
277fn prefix_sum(u: usize) -> usize {
278 let mut s = u;
279 s += s >> 8;
280 s += s >> 16;
281 s += s >> 32;
282 s
283}
284
285/// Takes two groups of 8 digits each packed into a `usize` as input, then returns the output
286/// digits and their respective locations. Ones from sums greater than ten are implicit and not
287/// included since recipes has already been pre-filled with ones.
288#[inline]
289fn unpack(first: usize, second: usize) -> (usize, usize, usize) {
290 const ONES: usize = 0x0101010101010101;
291 const SIXES: usize = 0x0606060606060606;
292 const INDICES: usize = 0x0001020304050607;
293
294 // Example values, showing each byte in a columm:
295 //
296 // first | 04 | 01 | 09 | 08 | 00 | 03 | 05 | 07 |
297 // second | 03 | 00 | 02 | 04 | 09 | 06 | 05 | 01 |
298 // sum | 07 | 01 | 0b | 0c | 09 | 09 | 0a | 08 |
299 let sum = first + second;
300
301 // Add 6 to each byte so that sums greater than or equal to ten become greater than or equal
302 // to 16, setting the first bit in the high nibble of each byte.
303 //
304 // sum | 07 | 01 | 0b | 0c | 09 | 09 | 0a | 08 |
305 // SIXES | 06 | 06 | 06 | 06 | 06 | 06 | 06 | 06 |
306 // total | 0d | 07 | 11 | 12 | 0f | 0f | 10 | 0e |
307 // tens | 00 | 00 | 01 | 01 | 00 | 00 | 01 | 00 |
308 let tens = ((sum + SIXES) >> 4) & ONES;
309
310 // Multiply by 10 to "spread" a 10 into each byte that has a total greater than 10.
311 //
312 // tens | 00 | 00 | 01 | 01 | 00 | 00 | 01 | 00 |
313 // tens * 10 | 00 | 00 | 0a | 0a | 00 | 00 | 0a | 00 |
314 // digits | 07 | 01 | 01 | 02 | 09 | 09 | 00 | 08 |
315 let digits = sum - 10 * tens;
316
317 // Columns greater than 10 will takes 2 bytes when written to recipes. Each index is
318 // offset by the number of 10s before it. Adding the normal increase indices gives the
319 // final location of each byte.
320 //
321 // tens | 00 | 00 | 01 | 01 | 00 | 00 | 01 | 00 |
322 // prefix sum | 00 | 00 | 01 | 02 | 02 | 02 | 03 | 03 |
323 // INDICES | 00 | 01 | 02 | 03 | 04 | 05 | 06 | 07 |
324 // indices | 00 | 02 | 03 | 05 | 06 | 07 | 09 | 0a |
325 let indices = prefix_sum(tens) + INDICES;
326
327 // The total number of bytes that need to be written is one plus the last index.
328 let extra = 1 + lsb(indices);
329
330 (digits, indices, extra)
331}