aoc/year2018/day14.rs
1//! # Chocolate Charts
2//!
3//! This solution is heavily inspired by [Askalski's](https://www.reddit.com/user/askalski/)
4//! excellent post [Breaking the 1 billion recipes per second barrier](https://www.reddit.com/r/adventofcode/comments/a6wpwa/2018_day_14_breaking_the_1_billion_recipes_per/)
5//!
6//! The key insight is that after 23 recipes the elves converge into using the *same subset* of
7//! recipes. This subset can be stored compactly in about 20% of the space and read sequentially
8//! to allow efficient vector processing.
9//!
10//! Tricks used to speed things up:
11//! * Separate writer and reader threads to generate recipes and check them in parallel.
12//! * Vector processing of recipes using techniques similar to SIMD.
13use crate::util::parse::*;
14use std::sync::atomic::{AtomicBool, Ordering};
15use std::sync::mpsc::{Receiver, Sender, channel};
16use std::thread;
17
18type Input = (String, usize);
19
20/// Pre-calculate the first 23 recipes.
21const PREFIX: [u8; 23] = [3, 7, 1, 0, 1, 0, 1, 2, 4, 5, 1, 5, 8, 9, 1, 6, 7, 7, 9, 2, 5, 1, 0];
22
23pub fn parse(input: &str) -> Input {
24    // Send batches of recipes from the writer to the reader for checking.
25    let (tx, rx) = channel();
26    // Thread safe flag to let writer know when to stop.
27    let done = AtomicBool::new(false);
28    // Store recipes in fixed size vec prefilled with ones. Part two result is around 20 million
29    // so size should be sufficient for most inputs.
30    let mut recipes = vec![1; 25_000_000];
31
32    thread::scope(|scope| {
33        // Start writer thread to produce new recipes.
34        scope.spawn(|| writer(tx, &done, recipes.as_mut_slice()));
35        // Reader thread checks recipes for the answers, returning when both parts are found.
36        scope.spawn(|| reader(rx, &done, input)).join().unwrap()
37    })
38}
39
40pub fn part1(input: &Input) -> &str {
41    &input.0
42}
43
44pub fn part2(input: &Input) -> usize {
45    input.1
46}
47
48/// Receives batches of recipes from the writer thread, then scans them byte by byte searching
49/// for the part two pattern. For simplicity the pattern is always assumed to by six digits.
50fn reader(rx: Receiver<&[u8]>, done: &AtomicBool, input: &str) -> (String, usize) {
51    let part_one_target = input.unsigned::<usize>() + 10;
52    let part_two_target = u32::from_str_radix(input.trim(), 16).unwrap();
53
54    let mut part_one_result = None;
55    let mut part_two_result = None;
56
57    let mut history = Vec::new();
58    let mut total = 0;
59    let mut pattern = 0;
60
61    for slice in rx {
62        history.push(slice);
63        total += slice.len();
64
65        // The recipes are broken up into batches. Even though these batches originally come
66        // from the same contiguous slice, this thread has no way to know that or reassemble
67        // the original. The result could potentially be split over two or more slices.
68        if part_one_result.is_none() && total >= part_one_target {
69            let mut index = 0;
70            let mut offset = part_one_target - 10;
71            let mut result = String::new();
72
73            for _ in 0..10 {
74                // If we go past the end of a slice then check the next one.
75                while offset >= history[index].len() {
76                    offset -= history[index].len();
77                    index += 1;
78                }
79
80                // Push each digit into a string as there could be leading zeroes.
81                let digit = history[index][offset];
82                result.push((digit + b'0') as char);
83                offset += 1;
84            }
85
86            part_one_result = Some(result);
87        }
88
89        // Simple brute force pattern matching. Slices are received in order so the pattern will
90        // handle cases when the target is split between two slices.
91        if part_two_result.is_none() {
92            for (i, n) in slice.iter().copied().enumerate() {
93                pattern = ((pattern << 4) | (n as u32)) & 0xffffff;
94
95                if pattern == part_two_target {
96                    part_two_result = Some(total - slice.len() + i - 5);
97                    break;
98                }
99            }
100        }
101
102        // Signal the writer thread to finish once both results are found.
103        if part_one_result.is_some() && part_two_result.is_some() {
104            done.store(true, Ordering::Relaxed);
105            break;
106        }
107    }
108
109    (part_one_result.unwrap(), part_two_result.unwrap())
110}
111
112/// Generates recipes then sends them to the reader thread for checking in batches.
113/// Processing is broken into alternating "cold" and "hot" loops. An outer enclosing loop checks
114/// periodically for the done signal from the reader thread.
115///
116/// The "cold" loop processes recipes serially one by one but can handle input corner cases.
117/// It's used when either:
118/// * One or both elves are within the first 23 recipes.
119/// * One or both elves are within the last 16 recipes.
120///
121/// The "hot" loop processes recipes efficiently in chunks of 16. The vast majority of recipes
122/// are calculated in this loop. As much as possible is parallelized using techniques similar to
123/// SIMD but using regular instructions instead of SIMD intrinsics or Rust's portable SIMD API.
124///
125/// Interestingly on an Apple M2 Max this "poor man's SIMD" has the same performance as using
126/// the portable SIMD API. This is probably due to the fact that the serial loops that write new
127/// recipes take the majority of the time.
128fn writer<'a>(tx: Sender<&'a [u8]>, done: &AtomicBool, mut recipes: &'a mut [u8]) {
129    // The first 23 recipes have already been generated
130    // so the elves start at position 0 and 8 respectively.
131    let mut elf1 = 0;
132    let mut index1 = 0;
133
134    let mut elf2 = 8;
135    let mut index2 = 0;
136
137    let mut base = 0;
138    let mut size = 23;
139    let mut needed = 23;
140
141    // Store the smaller subset of recipes used by the elves.
142    let mut write = 0;
143    let mut snack: Vec<u8> = vec![0; 5_000_000];
144
145    while !done.load(Ordering::Relaxed) {
146        // Cold loop to handle start and end transitions.
147        while elf1 < 23 || elf2 < 23 || write - index1.max(index2) <= 16 {
148            // After the first 23 recipes both elves converge on the same set of ingredients.
149            let recipe1 = if elf1 < 23 {
150                PREFIX[elf1]
151            } else {
152                index1 += 1;
153                snack[index1 - 1]
154            };
155
156            let recipe2 = if elf2 < 23 {
157                PREFIX[elf2]
158            } else {
159                index2 += 1;
160                snack[index2 - 1]
161            };
162
163            // Add next recipe.
164            let next = recipe1 + recipe2;
165            if next < 10 {
166                recipes[size - base] = next;
167                size += 1;
168            } else {
169                recipes[size - base + 1] = next - 10;
170                size += 2;
171            }
172
173            if needed < size {
174                let digit = recipes[needed - base];
175                needed += 1 + digit as usize;
176
177                snack[write] = digit;
178                write += 1;
179            }
180
181            // Wrap around to start if necessary.
182            elf1 += 1 + recipe1 as usize;
183            if elf1 >= size {
184                elf1 -= size;
185                index1 = 0;
186            }
187
188            elf2 += 1 + recipe2 as usize;
189            if elf2 >= size {
190                elf2 -= size;
191                index2 = 0;
192            }
193        }
194
195        // Hot loop to handle the majority of recipes in the middle. Process at most 10,000 recipes
196        // at a time in order to produce batches between 160,000 and 320,000 bytes in size.
197        // This size is roughly tuned in order to maximize reader thread throughput.
198        let batch_size = 10_000.min((write - index1.max(index2) - 1) / 16);
199
200        for _ in 0..batch_size {
201            // Snacks can be processed sequentially.
202            let first = from_be_bytes(&snack, index1);
203            let second = from_be_bytes(&snack, index2);
204            let third = from_be_bytes(&snack, index1 + 8);
205            let fourth = from_be_bytes(&snack, index2 + 8);
206
207            // Each elf will skip forward between 16 and 32 snacks.
208            elf1 += 16 + lsb(prefix_sum(first)) + lsb(prefix_sum(third));
209            elf2 += 16 + lsb(prefix_sum(second)) + lsb(prefix_sum(fourth));
210            index1 += 16;
211            index2 += 16;
212
213            // Process the digits in parallel using techniques similar to SIMD.
214            let (digits1, indices1, extra1) = unpack(first, second);
215            let (digits2, indices2, extra2) = unpack(third, fourth);
216
217            // Scatter each digit into the correct location, leaving "holes" where ones should go.
218            // This is handled correctly by prefilling `recipes`` with ones when initializing.
219            for shift in (0..64).step_by(8) {
220                let digit = lsb(digits1 >> shift);
221                let index = lsb(indices1 >> shift);
222                recipes[size - base + index] = digit as u8;
223
224                let digit = lsb(digits2 >> shift);
225                let index = lsb(indices2 >> shift);
226                recipes[size - base + index + extra1] = digit as u8;
227            }
228
229            size += extra1 + extra2;
230
231            // Write the recipes that will actually be used in subsequent loops to a smaller
232            // contiguous vec.
233            while needed < size {
234                let digit = recipes[needed - base];
235                needed += 1 + digit as usize;
236
237                snack[write] = digit;
238                write += 1;
239            }
240        }
241
242        // Split the mutable `recipes` slice into two parts. This allows the reader thread to
243        // access the head in parallel while the reader thread continues to write to the tail,
244        // ensuring unique ownership of each part of memory to prevent any concurrency issues.
245        let (head, tail) = recipes.split_at_mut(size - base);
246        let _unused = tx.send(head);
247        recipes = tail;
248        base = size;
249    }
250
251    // Drop the sender to make the receiver hang up.
252    drop(tx);
253}
254
255/// Convert 8 bytes in [big endian order](https://en.wikipedia.org/wiki/Endianness) into a `usize`.
256#[inline]
257fn from_be_bytes(slice: &[u8], index: usize) -> usize {
258    usize::from_be_bytes(slice[index..index + 8].try_into().unwrap())
259}
260
261/// Convenience function that returns least significant byte.
262#[inline]
263fn lsb(u: usize) -> usize {
264    u & 0xff
265}
266
267/// Compute the prefix sum of each byte within a `usize`. Let `a..h` denote the bytes from most
268/// significant to least significant and `Σx..y` denote the sum from `x` to `y` inclusive.
269///
270/// ```none
271///     s               |   a   |   b   |   c   |   d   |   e   |   f   |   g   |   h   |
272///     s += (s >> 8)   |   a   | Σa..b | Σb..c | Σc..d | Σd..e | Σe..f | Σf..g | Σg..h |
273///     s += (s >> 16)  |   a   | Σa..b | Σa..c | Σa..d | Σb..e | Σc..f | Σd..g | Σe..h |
274///     s += (s >> 32)  |   a   | Σa..b | Σa..c | Σa..d | Σa..e | Σa..f | Σa..g | Σa..h |
275/// ```
276#[inline]
277fn prefix_sum(u: usize) -> usize {
278    let mut s = u;
279    s += s >> 8;
280    s += s >> 16;
281    s += s >> 32;
282    s
283}
284
285/// Takes two groups of 8 digits each packed into a `usize` as input, then returns the output
286/// digits and their respective locations. Ones from sums greater than ten are implicit and not
287/// included since recipes has already been pre-filled with ones.
288#[inline]
289fn unpack(first: usize, second: usize) -> (usize, usize, usize) {
290    const ONES: usize = 0x0101010101010101;
291    const SIXES: usize = 0x0606060606060606;
292    const INDICES: usize = 0x0001020304050607;
293
294    // Example values, showing each byte in a columm:
295    //
296    // first      | 04 | 01 | 09 | 08 | 00 | 03 | 05 | 07 |
297    // second     | 03 | 00 | 02 | 04 | 09 | 06 | 05 | 01 |
298    // sum        | 07 | 01 | 0b | 0c | 09 | 09 | 0a | 08 |
299    let sum = first + second;
300
301    // Add 6 to each byte so that sums greater than or equal to ten become greater than or equal
302    // to 16, setting the first bit in the high nibble of each byte.
303    //
304    // sum        | 07 | 01 | 0b | 0c | 09 | 09 | 0a | 08 |
305    // SIXES      | 06 | 06 | 06 | 06 | 06 | 06 | 06 | 06 |
306    // total      | 0d | 07 | 11 | 12 | 0f | 0f | 10 | 0e |
307    // tens       | 00 | 00 | 01 | 01 | 00 | 00 | 01 | 00 |
308    let tens = ((sum + SIXES) >> 4) & ONES;
309
310    // Multiply by 10 to "spread" a 10 into each byte that has a total greater than 10.
311    //
312    // tens       | 00 | 00 | 01 | 01 | 00 | 00 | 01 | 00 |
313    // tens * 10  | 00 | 00 | 0a | 0a | 00 | 00 | 0a | 00 |
314    // digits     | 07 | 01 | 01 | 02 | 09 | 09 | 00 | 08 |
315    let digits = sum - 10 * tens;
316
317    // Columns greater than 10 will takes 2 bytes when written to recipes. Each index is
318    // offset by the number of 10s before it. Adding the normal increase indices gives the
319    // final location of each byte.
320    //
321    // tens       | 00 | 00 | 01 | 01 | 00 | 00 | 01 | 00 |
322    // prefix sum | 00 | 00 | 01 | 02 | 02 | 02 | 03 | 03 |
323    // INDICES    | 00 | 01 | 02 | 03 | 04 | 05 | 06 | 07 |
324    // indices    | 00 | 02 | 03 | 05 | 06 | 07 | 09 | 0a |
325    let indices = prefix_sum(tens) + INDICES;
326
327    // The total number of bytes that need to be written is one plus the last index.
328    let extra = 1 + lsb(indices);
329
330    (digits, indices, extra)
331}