Skip to main content

aoc/year2024/
day22.rs

1//! # Monkey Market
2//!
3//! Solves both parts simultaneously, parallelizing the work over multiple threads since
4//! each secret number is independent. The process of generating the next secret number is a
5//! [linear feedback shift register](https://en.wikipedia.org/wiki/Linear-feedback_shift_register).
6//! with a cycle of 2²⁴.
7//!
8//! Interestingly, this means that with some clever math it's possible to generate the `n`th number
9//! from any starting secret number with only 24 calculations. Unfortunately, this doesn't help for
10//! part two since we need to check every possible price change. However, to speed things up we can
11//! make several optimizations:
12//!
13//! * First the sequence of 4 prices is converted from -9..9 to a base 19 index of 0..19.
14//! * Whether a monkey has seen a sequence before and the total bananas for each sequence are
15//!   stored in an array. This is much faster than a `HashMap`. Using base 19 gives much better
16//!   cache locality needing only 130,321 elements, for example compared to shifting each new cost
17//!   by 5 bits and storing in an array of 2²⁰ = 1,048,576 elements. Multiplication on modern
18//!   processors is cheap (and several instructions can issue at once) but random memory access
19//!   is expensive.
20//!
21//! A SIMD variant processes 8 hashes at a time, taking about 60% of the time of the scalar version.
22//! The bottleneck is that disjoint indices must be written in sequence reducing the amount of work
23//! that can be parallelized.
24use crate::util::parse::*;
25use crate::util::thread::*;
26use implementation::*;
27
28type Input = (u64, u16);
29type Result = (u64, Vec<u16>);
30
31pub fn parse(input: &str) -> Input {
32    let result = parallel(input);
33
34    // Merge results from different threads.
35    let mut part_one = 0;
36    let mut part_two = vec![0; 130_321];
37
38    for (first, second) in result {
39        part_one += first;
40        part_two.iter_mut().zip(second).for_each(|(a, b)| *a += b);
41    }
42
43    (part_one, part_two.into_iter().max().unwrap())
44}
45
46pub fn part1(input: &Input) -> u64 {
47    input.0
48}
49
50pub fn part2(input: &Input) -> u16 {
51    input.1
52}
53
54#[cfg(not(feature = "simd"))]
55mod implementation {
56    use super::*;
57
58    // Use as many cores as possible to parallelize the remaining search.
59    pub(super) fn parallel(input: &str) -> Vec<Result> {
60        let numbers: Vec<_> = input.iter_unsigned().collect();
61        spawn_parallel_iterator(&numbers, worker)
62    }
63
64    fn worker(iter: ParIter<'_, u32>) -> Result {
65        let mut part_one = 0;
66        let mut part_two = vec![0; 130_321];
67        let mut seen = vec![u16::MAX; 130_321];
68
69        for (id, &number) in iter.enumerate() {
70            let id = id as u16;
71
72            let zeroth = number;
73            let first = hash(zeroth);
74            let second = hash(first);
75            let third = hash(second);
76
77            let mut a;
78            let mut b = to_index(zeroth, first);
79            let mut c = to_index(first, second);
80            let mut d = to_index(second, third);
81
82            let mut number = third;
83            let mut previous = third % 10;
84
85            for _ in 3..2000 {
86                number = hash(number);
87                let price = number % 10;
88
89                // Compute index into the array.
90                (a, b, c, d) = (b, c, d, to_index(previous, price));
91                let index = (6859 * a + 361 * b + 19 * c + d) as usize;
92                previous = price;
93
94                // Only sell the first time we see a sequence.
95                // By storing the id in the array we don't need to zero every iteration which is faster.
96                if seen[index] != id {
97                    part_two[index] += price as u16;
98                    seen[index] = id;
99                }
100            }
101
102            part_one += number as u64;
103        }
104
105        (part_one, part_two)
106    }
107
108    /// Compute next secret number using a
109    /// [Xorshift LFSR](https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Xorshift_LFSRs).
110    fn hash(mut n: u32) -> u32 {
111        n = (n ^ (n << 6)) & 0xffffff;
112        n = (n ^ (n >> 5)) & 0xffffff;
113        (n ^ (n << 11)) & 0xffffff
114    }
115
116    /// Convert -9..9 to 0..18.
117    fn to_index(previous: u32, current: u32) -> u32 {
118        9 + current % 10 - previous % 10
119    }
120}
121
122#[cfg(feature = "simd")]
123mod implementation {
124    use super::*;
125    use std::simd::Simd;
126    use std::simd::num::SimdUint as _;
127
128    type Vector = Simd<u32, 8>;
129
130    pub(super) fn parallel(input: &str) -> Vec<Result> {
131        let mut numbers: Vec<_> = input.iter_unsigned().collect();
132
133        // Add zero elements so that size is a multiple of 8.
134        // Zero always hashes to zero and does not contribute to score.
135        numbers.resize(numbers.len().next_multiple_of(8), 0);
136        let chunks: Vec<_> = numbers.chunks_exact(8).collect();
137
138        spawn_parallel_iterator(&chunks, worker)
139    }
140
141    /// Similar to scalar version but using SIMD vectors instead.
142    /// 8 lanes is the sweet spot for performance as the bottleneck is the scalar loop writing
143    /// to disjoint indices after each step.
144    fn worker(iter: ParIter<'_, &[u32]>) -> Result {
145        let ten = Simd::splat(10);
146        let x = Simd::splat(6859);
147        let y = Simd::splat(361);
148        let z = Simd::splat(19);
149
150        let mut part_one = 0;
151        let mut part_two = vec![0; 130_321];
152
153        for slice in iter {
154            // Each lane uses a different bit to track if a sequence has been seen before.
155            let mut seen = vec![u8::MAX; 130_321];
156
157            let zeroth = Simd::from_slice(slice);
158            let first = hash(zeroth);
159            let second = hash(first);
160            let third = hash(second);
161
162            let mut a;
163            let mut b = to_index(zeroth, first);
164            let mut c = to_index(first, second);
165            let mut d = to_index(second, third);
166
167            let mut number = third;
168            let mut previous = third % ten;
169
170            for _ in 3..2000 {
171                number = hash(number);
172                let prices = number % ten;
173
174                // Compute index into the array.
175                (a, b, c, d) = (b, c, d, to_index(previous, prices));
176                let indices = x * a + y * b + z * c + d;
177                previous = prices;
178
179                // Only sell the first time we see a sequence.
180                let indices = indices.to_array();
181                let prices = prices.to_array();
182
183                for i in 0..8 {
184                    let index = indices[i] as usize;
185
186                    // Avoid branching to improve speed, instead multiply by either 0 or 1,
187                    // depending if sequence has been seen before or not.
188                    let bit = (seen[index] >> i) & 1;
189                    seen[index] &= !(1 << i);
190
191                    part_two[index] += prices[i] as u16 * bit as u16;
192                }
193            }
194
195            part_one += number.reduce_sum() as u64;
196        }
197
198        (part_one, part_two)
199    }
200
201    /// SIMD vector arguments are passed in memory so inline functions to avoid slow transfers
202    /// to and from memory.
203    #[inline]
204    fn hash(mut n: Vector) -> Vector {
205        let mask = Simd::splat(0xffffff);
206        n = (n ^ (n << 6)) & mask;
207        n = (n ^ (n >> 5)) & mask;
208        (n ^ (n << 11)) & mask
209    }
210
211    #[inline]
212    fn to_index(previous: Vector, current: Vector) -> Vector {
213        let nine = Simd::splat(9);
214        let ten = Simd::splat(10);
215        nine + (current % ten) - (previous % ten)
216    }
217}