Skip to main content

readstat/
common.rs

//! Shared utility functions used across the crate.
//!
//! Provides helpers for computing streaming chunk offsets ([`build_offsets`]) and
//! converting C string pointers to owned Rust strings ([`ptr_to_string`]).
5
6use std::ffi::CStr;
7
/// Computes row offset boundaries for streaming chunk-based processing.
///
/// Given a total `row_count` and a chunk size `stream_rows`, returns a sorted
/// vector of offsets intended for use with [`windows(2)`](slice::windows), so
/// that each adjacent pair forms one half-open `[start, end)` row range.
///
/// The first offset is always `0` and the last is always `row_count`. When
/// `row_count` is 0 the result is `[0]`, which yields no pairs. A
/// `stream_rows` of 0 is treated as 1, i.e. one row per chunk.
///
/// # Example
///
/// ```
/// # use readstat::build_offsets;
/// let offsets = build_offsets(25, 10);
/// assert_eq!(offsets, vec![0, 10, 20, 25]);
/// // Produces pairs: [0,10), [10,20), [20,25)
///
/// // A zero chunk size falls back to one row per chunk.
/// assert_eq!(build_offsets(3, 0), vec![0, 1, 2, 3]);
/// ```
pub fn build_offsets(row_count: u32, stream_rows: u32) -> Vec<u32> {
    // Clamp once and use the clamped step for BOTH the chunk count and the
    // offsets. Multiplying by the raw `stream_rows` would emit a run of zeros
    // when it is 0, producing empty chunks followed by one oversized chunk.
    let step = stream_rows.max(1);
    let chunks = row_count.div_ceil(step);
    let mut offsets = Vec::with_capacity(chunks as usize + 1);

    // `c * step` cannot overflow: for every `c < chunks` the product is
    // strictly less than `row_count`.
    offsets.extend((0..chunks).map(|c| c * step));
    offsets.push(row_count);

    offsets
}
33
/// Converts a C string pointer to an owned Rust [`String`].
///
/// Returns an empty string if the pointer is null. Otherwise the bytes up to
/// (but not including) the first NUL are decoded with lossy UTF-8 conversion,
/// so invalid sequences become U+FFFD instead of causing an error.
///
/// A non-null `x` must point to a valid, NUL-terminated C string that stays
/// alive for the duration of the call; this function cannot verify that.
pub(crate) fn ptr_to_string(x: *const std::os::raw::c_char) -> String {
    if x.is_null() {
        return String::new();
    }

    // SAFETY: `x` is non-null (checked above). The caller is responsible for
    // passing a pointer to a NUL-terminated string that is valid for reads
    // for the duration of this call, as `CStr::from_ptr` requires.
    let cstr = unsafe { CStr::from_ptr(x) };

    // `to_string_lossy` yields a `Cow`: borrowed when the bytes are already
    // valid UTF-8, owned when replacement characters had to be inserted.
    // `into_owned` reuses the owned buffer instead of copying it again.
    cstr.to_string_lossy().into_owned()
}
48
#[cfg(test)]
mod tests {
    use super::*;
    use std::ffi::CString;

    /// Expands `offsets` into the `(start, end)` pairs that `windows(2)` yields.
    fn window_pairs(offsets: &[u32]) -> Vec<(u32, u32)> {
        offsets.windows(2).map(|w| (w[0], w[1])).collect()
    }

    /// Runs `ptr_to_string` over `bytes` followed by a NUL terminator.
    fn decode_raw(bytes: &[u8]) -> String {
        // `CStr::from_ptr` needs a NUL-terminated buffer, so one is built
        // explicitly here; `raw` outlives the call that reads through the pointer.
        let mut raw = bytes.to_vec();
        raw.push(0);
        ptr_to_string(raw.as_ptr().cast::<std::os::raw::c_char>())
    }

    // --- build_offsets tests ---

    #[test]
    fn build_offsets_exact_division() {
        assert_eq!(build_offsets(30, 10), vec![0, 10, 20, 30]);
    }

    #[test]
    fn build_offsets_non_exact_division() {
        assert_eq!(build_offsets(25, 10), vec![0, 10, 20, 25]);
    }

    #[test]
    fn build_offsets_stream_exceeds_row_count() {
        assert_eq!(build_offsets(5, 10), vec![0, 5]);
    }

    #[test]
    fn build_offsets_single_row() {
        assert_eq!(build_offsets(1, 10), vec![0, 1]);
    }

    #[test]
    fn build_offsets_equal_stream_and_rows() {
        assert_eq!(build_offsets(10, 10), vec![0, 10]);
    }

    #[test]
    fn build_offsets_zero_rows() {
        let offsets = build_offsets(0, 10);
        assert_eq!(offsets, vec![0]);
        // A single offset cannot form a pair, so there are no chunks to process.
        assert!(window_pairs(&offsets).is_empty());
    }

    #[test]
    fn build_offsets_windows_produce_valid_pairs() {
        let offsets = build_offsets(25, 10);
        assert_eq!(window_pairs(&offsets), vec![(0, 10), (10, 20), (20, 25)]);
    }

    #[test]
    fn build_offsets_single_chunk_windows() {
        let offsets = build_offsets(5, 10);
        assert_eq!(window_pairs(&offsets), vec![(0, 5)]);
    }

    #[test]
    fn build_offsets_large_dataset() {
        let offsets = build_offsets(100_000, 10_000);
        assert_eq!(offsets.len(), 11);
        assert_eq!(offsets.first().copied(), Some(0));
        assert_eq!(offsets.last().copied(), Some(100_000));
    }

    // --- ptr_to_string tests ---

    #[test]
    fn ptr_to_string_null_returns_empty() {
        assert_eq!(ptr_to_string(std::ptr::null()), "");
    }

    #[test]
    fn ptr_to_string_valid_cstring() {
        let source = CString::new("hello").unwrap();
        assert_eq!(ptr_to_string(source.as_ptr()), "hello");
    }

    #[test]
    fn ptr_to_string_empty_cstring() {
        let source = CString::new("").unwrap();
        assert_eq!(ptr_to_string(source.as_ptr()), "");
    }

    #[test]
    fn ptr_to_string_with_unicode() {
        let source = CString::new("UTF-8 encoded: café").unwrap();
        assert_eq!(ptr_to_string(source.as_ptr()), "UTF-8 encoded: café");
    }

    #[test]
    fn ptr_to_string_with_truncated_utf8() {
        // Simulates SAS cutting "café" off in the middle of a character.
        // "café" is [63, 61, 66, C3, A9] in UTF-8; keeping only four bytes
        // leaves [63, 61, 66, C3], an incomplete two-byte sequence. The
        // dangling 0xC3 must come back as U+FFFD.
        assert_eq!(decode_raw(b"caf\xC3"), "caf\u{FFFD}");
    }

    #[test]
    fn ptr_to_string_with_invalid_continuation_byte() {
        // 0xFF never appears in well-formed UTF-8.
        assert_eq!(decode_raw(b"hello\xFFworld"), "hello\u{FFFD}world");
    }

    // --- Property-based tests ---

    mod property_tests {
        use super::*;
        use proptest::prelude::*;

        proptest! {
            /// The offsets always begin at 0 and finish at `row_count`.
            #[test]
            fn offsets_start_at_zero_end_at_row_count(
                row_count in 0u32..100_000,
                stream_rows in 1u32..50_000
            ) {
                let offsets = build_offsets(row_count, stream_rows);
                prop_assert_eq!(offsets.first().copied(), Some(0));
                prop_assert_eq!(offsets.last().copied(), Some(row_count));
            }

            /// Each offset is strictly greater than the one before it.
            #[test]
            fn offsets_are_monotonically_increasing(
                row_count in 1u32..100_000,
                stream_rows in 1u32..50_000
            ) {
                let offsets = build_offsets(row_count, stream_rows);
                for w in offsets.windows(2) {
                    let (lo, hi) = (w[0], w[1]);
                    prop_assert!(lo < hi, "offsets not strictly increasing: {} >= {}", lo, hi);
                }
            }

            /// No chunk spans more than `stream_rows` rows.
            #[test]
            fn chunk_sizes_bounded_by_stream_rows(
                row_count in 1u32..100_000,
                stream_rows in 1u32..50_000
            ) {
                let offsets = build_offsets(row_count, stream_rows);
                for w in offsets.windows(2) {
                    let len = w[1] - w[0];
                    prop_assert!(len <= stream_rows, "chunk {} > stream_rows {}", len, stream_rows);
                }
            }

            /// Chunk sizes add up to `row_count`, so no row is left out.
            #[test]
            fn chunks_cover_all_rows(
                row_count in 0u32..100_000,
                stream_rows in 1u32..50_000
            ) {
                let offsets = build_offsets(row_count, stream_rows);
                let covered = offsets.windows(2).map(|w| w[1] - w[0]).sum::<u32>();
                prop_assert_eq!(covered, row_count);
            }

            /// A zero `stream_rows` does not panic and keeps both endpoints intact.
            #[test]
            fn zero_stream_rows_does_not_panic(row_count in 0u32..10_000) {
                let offsets = build_offsets(row_count, 0);
                prop_assert_eq!(offsets.first().copied(), Some(0));
                prop_assert_eq!(offsets.last().copied(), Some(row_count));
            }
        }
    }
}