aoc/year2024/
day22.rs

1//! # Monkey Market
2//!
3//! Solves both parts simultaneously, parallelizing the work over multiple threads since
4//! each secret number is independent. The process of generating the next secret number is a
5//! [linear feedback shift register](https://en.wikipedia.org/wiki/Linear-feedback_shift_register).
6//! with a cycle of 2²⁴.
7//!
8//! Interestingly this means that with some clever math it's possible to generate the `n`th number
9//! from any starting secret number with only 24 calculations. Unfortunately this doesn't help for
10//! part two since we need to check every possible price change. However to speed things up we can
11//! make several optimizations:
12//!
13//! * First the sequence of 4 prices is converted from -9..9 to a base 19 index of 0..19.
14//! * Whether a monkey has seen a sequence before and the total bananas for each sequence are
15//!   stored in an array. This is much faster than a `HashMap`. Using base 19 gives much better
16//!   cache locality needing only 130321 elements, for example compared to shifting each new cost
17//!   by 5 bits and storing in an array of 2²⁰ = 1048675 elements. Multiplication on modern
18//!   processors is cheap (and several instructions can issue at once) but random memory access
19//!   is expensive.
20//!
21//! A SIMD variant processes 8 hashes at a time, taking about 60% of the time of the scalar version.
22//! The bottleneck is that disjoint indices must be written in sequence reducing the amount of work
23//! that can be parallelized.
24use crate::util::parse::*;
25use crate::util::thread::*;
26
27type Input = (u64, u16);
28type Result = (u64, Vec<u16>);
29
30pub fn parse(input: &str) -> Input {
31    #[cfg(not(feature = "simd"))]
32    let result = scalar::parallel(input);
33    #[cfg(feature = "simd")]
34    let result = simd::parallel(input);
35
36    // Merge results from different threads.
37    let mut part_one = 0;
38    let mut part_two = vec![0; 130321];
39
40    for (first, second) in result {
41        part_one += first;
42        part_two.iter_mut().zip(second).for_each(|(a, b)| *a += b);
43    }
44
45    (part_one, part_two.into_iter().max().unwrap())
46}
47
48pub fn part1(input: &Input) -> u64 {
49    input.0
50}
51
52pub fn part2(input: &Input) -> u16 {
53    input.1
54}
55
56#[cfg(not(feature = "simd"))]
57mod scalar {
58    use super::*;
59
60    // Use as many cores as possible to parallelize the remaining search.
61    pub(super) fn parallel(input: &str) -> Vec<Result> {
62        let numbers: Vec<_> = input.iter_unsigned().collect();
63        spawn_parallel_iterator(&numbers, worker)
64    }
65
66    fn worker(iter: ParIter<'_, u32>) -> Result {
67        let mut part_one = 0;
68        let mut part_two = vec![0; 130321];
69        let mut seen = vec![u16::MAX; 130321];
70
71        for (id, &number) in iter.enumerate() {
72            let id = id as u16;
73
74            let zeroth = number;
75            let first = hash(zeroth);
76            let second = hash(first);
77            let third = hash(second);
78
79            let mut a;
80            let mut b = to_index(zeroth, first);
81            let mut c = to_index(first, second);
82            let mut d = to_index(second, third);
83
84            let mut number = third;
85            let mut previous = third % 10;
86
87            for _ in 3..2000 {
88                number = hash(number);
89                let price = number % 10;
90
91                // Compute index into the array.
92                (a, b, c, d) = (b, c, d, to_index(previous, price));
93                let index = (6859 * a + 361 * b + 19 * c + d) as usize;
94                previous = price;
95
96                // Only sell the first time we see a sequence.
97                // By storing the id in the array we don't need to zero every iteration which is faster.
98                if seen[index] != id {
99                    part_two[index] += price as u16;
100                    seen[index] = id;
101                }
102            }
103
104            part_one += number as u64;
105        }
106
107        (part_one, part_two)
108    }
109
110    /// Compute next secret number using a
111    /// [Xorshift LFSR](https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Xorshift_LFSRs).
112    fn hash(mut n: u32) -> u32 {
113        n = (n ^ (n << 6)) & 0xffffff;
114        n = (n ^ (n >> 5)) & 0xffffff;
115        (n ^ (n << 11)) & 0xffffff
116    }
117
118    /// Convert -9..9 to 0..18.
119    fn to_index(previous: u32, current: u32) -> u32 {
120        9 + current % 10 - previous % 10
121    }
122}
123
124#[cfg(feature = "simd")]
125mod simd {
126    use super::*;
127    use std::simd::Simd;
128    use std::simd::num::SimdUint as _;
129
130    type Vector = Simd<u32, 8>;
131
132    pub(super) fn parallel(input: &str) -> Vec<Result> {
133        let mut numbers: Vec<_> = input.iter_unsigned().collect();
134
135        // Add zero elements so that size is a multiple of 8.
136        // Zero always hashes to zero and does not contribute to score.
137        numbers.resize(numbers.len().next_multiple_of(8), 0);
138        let chunks: Vec<_> = numbers.chunks_exact(8).collect();
139
140        spawn_parallel_iterator(&chunks, worker)
141    }
142
143    /// Similar to scalar version but using SIMD vectors instead.
144    /// 8 lanes is the sweet spot for performance as the bottleneck is the scalar loop writing
145    /// to disjoint indices after each step.
146    fn worker(iter: ParIter<'_, &[u32]>) -> Result {
147        let ten = Simd::splat(10);
148        let x = Simd::splat(6859);
149        let y = Simd::splat(361);
150        let z = Simd::splat(19);
151
152        let mut part_one = 0;
153        let mut part_two = vec![0; 130321];
154
155        for slice in iter {
156            // Each lane uses a different bit to track if a sequence has been seen before.
157            let mut seen = vec![u8::MAX; 130321];
158
159            let zeroth = Simd::from_slice(slice);
160            let first = hash(zeroth);
161            let second = hash(first);
162            let third = hash(second);
163
164            let mut a;
165            let mut b = to_index(zeroth, first);
166            let mut c = to_index(first, second);
167            let mut d = to_index(second, third);
168
169            let mut number = third;
170            let mut previous = third % ten;
171
172            for _ in 3..2000 {
173                number = hash(number);
174                let prices = number % ten;
175
176                // Compute index into the array.
177                (a, b, c, d) = (b, c, d, to_index(previous, prices));
178                let indices = x * a + y * b + z * c + d;
179                previous = prices;
180
181                // Only sell the first time we see a sequence.
182                let indices = indices.to_array();
183                let prices = prices.to_array();
184
185                for i in 0..8 {
186                    let index = indices[i] as usize;
187
188                    // Avoid branching to improve speed, instead multiply by either 0 or 1,
189                    // depending if sequence has been seen before or not.
190                    let bit = (seen[index] >> i) & 1;
191                    seen[index] &= !(1 << i);
192
193                    part_two[index] += prices[i] as u16 * bit as u16;
194                }
195            }
196
197            part_one += number.reduce_sum() as u64;
198        }
199
200        (part_one, part_two)
201    }
202
203    /// SIMD vector arguments are passed in memory so inline functions to avoid slow transfers
204    /// to and from memory.
205    #[inline]
206    fn hash(mut n: Vector) -> Vector {
207        let mask = Simd::splat(0xffffff);
208        n = (n ^ (n << 6)) & mask;
209        n = (n ^ (n >> 5)) & mask;
210        (n ^ (n << 11)) & mask
211    }
212
213    #[inline]
214    fn to_index(previous: Vector, current: Vector) -> Vector {
215        let nine = Simd::splat(9);
216        let ten = Simd::splat(10);
217        nine + (current % ten) - (previous % ten)
218    }
219}