aoc/year2017/day15.rs
1//! # Dueling Generators
2//!
3//! Multithreaded approach using worker threads to generate batches of numbers for judging.
4//! Part one can be checked in parallel, but part two must be sent to a single thread as the
5//! indices must be checked in order.
6//!
7//! The sequence of numbers are [modular exponentiation](https://en.wikipedia.org/wiki/Modular_exponentiation)
8//! so we can jump to any location in the sequence, without needing to know the previous numbers.
9//!
10//! The generator is in the hot path, so anything we can do to make it run faster is worthwhile;
11//! start by observing that our divisor 0x7fffffff is of the form `2ᵏ - 1`, which lends itself
12//! well to computing a remainder with less work than a hardware division (the analysis here works
13//! for any number adjacent to a power of two, not just Mersenne primes). At a high level,
14//! computing `X % Y` is the same as repeatedly subtracting `Y` from a starting point of `X` until
15//! reaching a value less than `Y`. How many times does that subtraction occur? That's easy,
16//! `X / Y`. But when dividing by `Y` is expensive (a hardware division by an odd number takes
17//! multiple clock cycles), what if we divide by `Y + 1` instead (dividing by 2ᵏ is just
18//! performing a bit mask). Conceptually, the remainder after each subtraction of `Y + 1`
19//! grows by an error of one until we reach a remainder of `X % (Y + 1)` - but we know the total
20//! error: it was the number of times we subtracted the denominator, or `X / (Y + 1)`; and
21//! that value is also available, with just a bit shift. Adding the 31-bit adjusted remainder
22//! with the 31-bit error can overflow to 32 bits; then a final comparison against `MOD` gets
23//! the correct answer in `fast_mod` faster than hardware division. See also
24//! [this post](https://www.reddit.com/r/adventofcode/comments/7jxkiw/comment/drazokj/).
25use crate::util::hash::*;
26use crate::util::iter::*;
27use crate::util::math::*;
28use crate::util::parse::*;
29use crate::util::thread::*;
30use std::sync::mpsc::{Receiver, Sender, channel};
31use std::thread;
32
33const MOD: usize = 0x7fffffff;
34const PART_ONE: usize = 40_000_000;
35const PART_TWO: usize = 5_000_000;
36const BLOCK: usize = 50_000;
37
38type Input = (usize, usize);
39
40/// State shared between all threads.
41pub struct Shared {
42 first: usize,
43 second: usize,
44 iter: AtomicIter,
45}
46
47/// Generated numbers from `start` to `start + BLOCK`.
48struct Block {
49 start: usize,
50 ones: usize,
51 fours: Vec<u16>,
52 eights: Vec<u16>,
53}
54
55pub fn parse(input: &str) -> Input {
56 let [first, second] = input.iter_unsigned().chunk::<2>().next().unwrap();
57 let shared = Shared { first, second, iter: AtomicIter::new(0, BLOCK as u32) };
58 let (tx, rx) = channel();
59
60 thread::scope(|scope| {
61 // Use all cores except one to generate blocks of numbers for judging.
62 for _ in 0..threads() - 1 {
63 scope.spawn(|| sender(&shared, &tx));
64 }
65 // Judge batches serially.
66 receiver(&shared, &rx)
67 })
68}
69
70pub fn part1(input: &Input) -> usize {
71 input.0
72}
73
74pub fn part2(input: &Input) -> usize {
75 input.1
76}
77
78fn sender(shared: &Shared, tx: &Sender<Block>) {
79 while let Some(start) = shared.iter.next() {
80 // Start at any point in the sequence using modular exponentiation.
81 let start = start as usize;
82 let mut first = shared.first * 16807.mod_pow(start, MOD);
83 let mut second = shared.second * 48271.mod_pow(start, MOD);
84
85 // Estimate capacity at one quarter or one eighth.
86 let mut ones = 0;
87 let mut fours = Vec::with_capacity(BLOCK / 4);
88 let mut eights = Vec::with_capacity(BLOCK / 8);
89
90 // Check part one pairs immediately while queueing part two pairs.
91 for _ in 0..BLOCK {
92 first = fast_mod(first * 16807);
93 second = fast_mod(second * 48271);
94
95 let left = first as u16;
96 let right = second as u16;
97
98 if left == right {
99 ones += 1;
100 }
101 if left.is_multiple_of(4) {
102 fours.push(left);
103 }
104 if right.is_multiple_of(8) {
105 eights.push(right);
106 }
107 }
108
109 let _unused = tx.send(Block { start, ones, fours, eights });
110 }
111}
112
113fn receiver(shared: &Shared, rx: &Receiver<Block>) -> Input {
114 let mut required = 0;
115 let mut out_of_order = FastMap::new();
116
117 let mut fours = Vec::with_capacity(PART_TWO + BLOCK);
118 let mut eights = Vec::with_capacity(PART_TWO + BLOCK);
119 let mut start = 0;
120
121 let mut part_one = 0;
122 let mut part_two = 0;
123
124 while required < PART_ONE || fours.len() < PART_TWO || eights.len() < PART_TWO {
125 // Blocks could be received in any order, as there's no guarantee threads will finish
126 // processing at the same time. The `start` field of the block defines the order they
127 // must be added to the vec.
128 while let Ok(block) = rx.try_recv() {
129 out_of_order.insert(block.start, block);
130 }
131
132 while let Some(block) = out_of_order.remove(&required) {
133 required += BLOCK;
134
135 if required <= PART_ONE {
136 part_one += block.ones;
137 }
138
139 if fours.len() < PART_TWO {
140 fours.extend_from_slice(&block.fours);
141 }
142
143 if eights.len() < PART_TWO {
144 eights.extend_from_slice(&block.eights);
145 }
146
147 let end = PART_TWO.min(fours.len()).min(eights.len());
148 part_two +=
149 fours[start..end].iter().zip(&eights[start..end]).filter(|(a, b)| a == b).count();
150 start = end;
151 }
152 }
153
154 // Signal worker threads to finish.
155 shared.iter.stop();
156
157 (part_one, part_two)
158}
159
160/// Fast computation of n % 0x7fffffff.
161#[inline]
162fn fast_mod(n: usize) -> usize {
163 let low = n & MOD;
164 let high = n >> 31;
165 let sum = low + high;
166 if sum < MOD { sum } else { sum - MOD }
167}