readstat/
common.rs

1//! Shared utility functions used across the crate.
2//!
3//! Provides helpers for computing streaming chunk offsets ([`build_offsets`]) and
4//! converting C string pointers to owned Rust strings ([`ptr_to_string`]).
5
6use std::ffi::CStr;
7
8use crate::err::ReadStatError;
9
10/// Computes row offset boundaries for streaming chunk-based processing.
11///
12/// Given a total `row_count` and `stream_rows` (chunk size), returns a sorted
13/// vector of offsets for use with [`windows(2)`](slice::windows) to form
14/// `[start, end)` pairs.
15///
16/// # Errors
17///
18/// Returns [`ReadStatError`] if offset computation fails.
19///
20/// # Example
21///
22/// ```
23/// # use readstat::build_offsets;
24/// let offsets = build_offsets(25, 10).unwrap();
25/// assert_eq!(offsets, vec![0, 10, 20, 25]);
26/// // Produces pairs: [0,10), [10,20), [20,25)
27/// ```
28pub fn build_offsets(row_count: u32, stream_rows: u32) -> Result<Vec<u32>, ReadStatError> {
29    let chunks = row_count.div_ceil(stream_rows.max(1));
30    let mut offsets = Vec::with_capacity(chunks as usize + 1);
31
32    for c in 0..chunks {
33        offsets.push(c * stream_rows);
34    }
35    offsets.push(row_count);
36
37    Ok(offsets)
38}
39
/// Converts a NUL-terminated C string pointer to an owned Rust [`String`].
///
/// Returns an empty string if the pointer is null. Bytes that are not valid
/// UTF-8 are replaced with U+FFFD (lossy conversion) instead of failing.
///
/// # Caller contract
///
/// This function is not marked `unsafe`, but a non-null `x` is dereferenced:
/// it must point to a readable, NUL-terminated byte sequence that stays valid
/// for the duration of the call.
pub(crate) fn ptr_to_string(x: *const i8) -> String {
    if x.is_null() {
        return String::new();
    }

    // SAFETY: `x` is non-null (checked above). The caller guarantees it points
    // to a NUL-terminated buffer that remains valid while it is read here.
    // `cast()` converts to `*const c_char`, which is `i8` on some targets and
    // `u8` on others, so this line compiles on both.
    let cstr = unsafe { CStr::from_ptr(x.cast()) };

    // `into_owned` takes over the buffer the lossy path may already have
    // allocated, rather than copying it a second time.
    cstr.to_string_lossy().into_owned()
}
54
#[cfg(test)]
mod tests {
    use super::*;
    use std::ffi::CString;

    /// Calls `build_offsets` and unwraps the result.
    fn offsets_for(rows: u32, chunk: u32) -> Vec<u32> {
        build_offsets(rows, chunk).unwrap()
    }

    /// Turns an offsets slice into the `(start, end)` pairs that `windows(2)` yields.
    fn as_ranges(offsets: &[u32]) -> Vec<(u32, u32)> {
        offsets.windows(2).map(|pair| (pair[0], pair[1])).collect()
    }

    /// Round-trips `text` through a `CString` and `ptr_to_string`.
    fn decode_cstring(text: &str) -> String {
        let owned = CString::new(text).unwrap();
        ptr_to_string(owned.as_ptr())
    }

    /// Appends a NUL terminator to `bytes` (needed by `CStr::from_ptr`) and
    /// passes the buffer to `ptr_to_string`. The buffer outlives the call.
    fn decode_raw(bytes: &[u8]) -> String {
        let mut nul_terminated = Vec::with_capacity(bytes.len() + 1);
        nul_terminated.extend_from_slice(bytes);
        nul_terminated.push(0);
        ptr_to_string(nul_terminated.as_ptr().cast::<i8>())
    }

    // --- build_offsets tests ---

    #[test]
    fn build_offsets_exact_division() {
        assert_eq!(offsets_for(30, 10), [0, 10, 20, 30]);
    }

    #[test]
    fn build_offsets_non_exact_division() {
        assert_eq!(offsets_for(25, 10), [0, 10, 20, 25]);
    }

    #[test]
    fn build_offsets_stream_exceeds_row_count() {
        assert_eq!(offsets_for(5, 10), [0, 5]);
    }

    #[test]
    fn build_offsets_single_row() {
        assert_eq!(offsets_for(1, 10), [0, 1]);
    }

    #[test]
    fn build_offsets_equal_stream_and_rows() {
        assert_eq!(offsets_for(10, 10), [0, 10]);
    }

    #[test]
    fn build_offsets_zero_rows() {
        let boundaries = offsets_for(0, 10);
        assert_eq!(boundaries, [0]);
        // A single boundary cannot form a window, so nothing is streamed.
        assert!(boundaries.windows(2).next().is_none());
    }

    #[test]
    fn build_offsets_windows_produce_valid_pairs() {
        let ranges = as_ranges(&offsets_for(25, 10));
        assert_eq!(ranges, [(0, 10), (10, 20), (20, 25)]);
    }

    #[test]
    fn build_offsets_single_chunk_windows() {
        let ranges = as_ranges(&offsets_for(5, 10));
        assert_eq!(ranges, [(0, 5)]);
    }

    #[test]
    fn build_offsets_large_dataset() {
        let boundaries = offsets_for(100_000, 10_000);
        assert_eq!(boundaries.len(), 11);
        assert_eq!(boundaries.first(), Some(&0));
        assert_eq!(boundaries.last(), Some(&100_000));
    }

    // --- ptr_to_string tests ---

    #[test]
    fn ptr_to_string_null_returns_empty() {
        let decoded = ptr_to_string(std::ptr::null());
        assert_eq!(decoded, "");
    }

    #[test]
    fn ptr_to_string_valid_cstring() {
        assert_eq!(decode_cstring("hello"), "hello");
    }

    #[test]
    fn ptr_to_string_empty_cstring() {
        assert_eq!(decode_cstring(""), "");
    }

    #[test]
    fn ptr_to_string_with_unicode() {
        assert_eq!(decode_cstring("UTF-8 encoded: café"), "UTF-8 encoded: café");
    }

    #[test]
    fn ptr_to_string_with_truncated_utf8() {
        // Simulates SAS cutting "café" in the middle of a character.
        // "café" encodes as [63, 61, 66, C3, A9]; keeping only four bytes leaves
        // [63, 61, 66, C3], whose last byte opens a 2-byte sequence that never
        // completes. The dangling 0xC3 must come back as U+FFFD.
        assert_eq!(decode_raw(b"caf\xC3"), "caf\u{FFFD}");
    }

    #[test]
    fn ptr_to_string_with_invalid_continuation_byte() {
        // 0xFF is never valid in UTF-8
        assert_eq!(decode_raw(b"hello\xFFworld"), "hello\u{FFFD}world");
    }
}