readstat/
formats.rs

1//! SAS format string classification using regex-based detection.
2//!
3//! SAS variables carry format strings (e.g. `DATE9`, `DATETIME22.3`, `TIME8`) that
4//! determine how raw numeric values should be interpreted. This module classifies
5//! those format strings into [`ReadStatVarFormatClass`] variants (Date, `DateTime`,
6//! Time, and their sub-second precision variants), enabling correct Arrow type mapping.
7//!
8//! Supports all 118+ SAS date/time/datetime formats including ISO 8601 variants,
9//! national language (`NL*`) formats, and precision-based datetime/time formats.
10
11use std::sync::LazyLock;
12
13use regex::Regex;
14
15use crate::rs_var::ReadStatVarFormatClass;
16
17// DATETIME with nanosecond precision (DATETIMEw.d where d=7-9)
18static RE_DATETIME_WITH_NANO: LazyLock<Regex> =
19    LazyLock::new(|| Regex::new(r"(?xi)^DATETIME[0-9]{1,2}\.[7-9]$").unwrap());
20
21// DATETIME with microsecond precision (DATETIMEw.d where d=4-6)
22static RE_DATETIME_WITH_MICRO: LazyLock<Regex> =
23    LazyLock::new(|| Regex::new(r"(?xi)^DATETIME[0-9]{1,2}\.[4-6]$").unwrap());
24
25// DATETIME with millisecond precision (DATETIMEw.d where d=1-3)
26static RE_DATETIME_WITH_MILLI: LazyLock<Regex> =
27    LazyLock::new(|| Regex::new(r"(?xi)^DATETIME[0-9]{1,2}\.[1-3]$").unwrap());
28
29// TIME with microsecond precision (TIMEw.d where d=4-6)
30static RE_TIME_WITH_MICRO: LazyLock<Regex> =
31    LazyLock::new(|| Regex::new(r"(?xi)^TIME[0-9]{1,2}\.[4-6]$").unwrap());
32
33// All time formats - checked before datetime to catch NLDATMTM and NLDATMTZ
34// Suffix allows letter width/decimal (W, WD) and/or numeric width/decimal (8, 8.2)
35static RE_TIME: LazyLock<Regex> = LazyLock::new(|| {
36    Regex::new(
37        r"(?xi)
38        ^(
39            B8601LZ  |
40            B8601TM  |
41            B8601TX  |
42            B8601TZ  |
43            E8601LZ  |
44            E8601TM  |
45            E8601TX  |
46            E8601TZ  |
47            HHMM     |
48            HOUR     |
49            MMSS     |
50            NLDATMTM |
51            NLDATMTZ |
52            NLTIMAP  |
53            NLTIME   |
54            TIMEAMPM |
55            TIME     |
56            TOD
57        )[A-Z0-9]*(\.[A-Z0-9]*)?$",
58    )
59    .unwrap()
60});
61
62// All datetime formats - checked before date to catch DATEAMPM and DATETIME
63// NLDATM matches all NLDATM* variants; NLDATMTM/NLDATMTZ already caught by RE_TIME
64static RE_DATETIME: LazyLock<Regex> = LazyLock::new(|| {
65    Regex::new(
66        r"(?xi)
67        ^(
68            B8601DT  |
69            B8601DX  |
70            B8601DZ  |
71            B8601LX  |
72            DATEAMPM |
73            DATETIME |
74            E8601DT  |
75            E8601DX  |
76            E8601DZ  |
77            E8601LX  |
78            MDYAMPM  |
79            NLDATM
80        )[A-Z0-9]*(\.[A-Z0-9]*)?$",
81    )
82    .unwrap()
83});
84
85// All date formats
86static RE_DATE: LazyLock<Regex> = LazyLock::new(|| {
87    Regex::new(
88        r"(?xi)
89        ^(
90            B8601DA   |
91            B8601DN   |
92            DATE      |
93            DAY       |
94            DDMMYY    |
95            DOWNAME   |
96            DTDATE    |
97            DTMONXY   |
98            DTWKDATX  |
99            DTYEAR    |
100            DTYYQC    |
101            E8601DA   |
102            E8601DN   |
103            JULDAY    |
104            JULIAN    |
105            MMDDYY    |
106            MMYY      |
107            MONNAME   |
108            MONTH     |
109            MONYY     |
110            NENGO     |
111            NLDATE    |
112            QTRR?     |
113            WEEKDATX  |
114            WEEKDAY   |
115            YEAR      |
116            YYMMDD    |
117            YYMM      |
118            YYMON     |
119            YYQR      |
120            YYQ       |
121            YYWEEK[UVW]
122        )[A-Z0-9]*(\.[A-Z0-9]*)?$",
123    )
124    .unwrap()
125});
126
127/// Classifies a SAS format string into a [`ReadStatVarFormatClass`].
128///
129/// Returns `Some(class)` for recognized date/time/datetime formats, or `None`
130/// for numeric/character formats that don't represent temporal data.
131/// Matching is case-insensitive and handles both numeric widths (`DATE9`)
132/// and letter-width suffixes (`DATEW`).
133pub(crate) fn match_var_format(v: &str) -> Option<ReadStatVarFormatClass> {
134    // Check order matters:
135    // 1. DATETIME precision variants (most specific, numeric width only)
136    // 2. TIME precision variants (most specific, numeric width only)
137    // 3. Time (catches NLDATMTM, NLDATMTZ before general NLDATM datetime match)
138    // 4. General datetime (catches DATEAMPM, DATETIME before DATE match)
139    // 5. Date (everything else)
140    if RE_DATETIME_WITH_NANO.is_match(v) {
141        Some(ReadStatVarFormatClass::DateTimeWithNanoseconds)
142    } else if RE_DATETIME_WITH_MICRO.is_match(v) {
143        Some(ReadStatVarFormatClass::DateTimeWithMicroseconds)
144    } else if RE_DATETIME_WITH_MILLI.is_match(v) {
145        Some(ReadStatVarFormatClass::DateTimeWithMilliseconds)
146    } else if RE_TIME_WITH_MICRO.is_match(v) {
147        Some(ReadStatVarFormatClass::TimeWithMicroseconds)
148    } else if RE_TIME.is_match(v) {
149        Some(ReadStatVarFormatClass::Time)
150    } else if RE_DATETIME.is_match(v) {
151        Some(ReadStatVarFormatClass::DateTime)
152    } else if RE_DATE.is_match(v) {
153        Some(ReadStatVarFormatClass::Date)
154    } else {
155        None
156    }
157}
158
#[cfg(test)]
mod tests {
    use super::*;

    /// Short local name for the classification enum.
    type Class = ReadStatVarFormatClass;

    /// Asserts that a single format string classifies as `expected`.
    fn expect_class(fmt: &str, expected: Class) {
        assert_eq!(match_var_format(fmt), Some(expected));
    }

    /// Asserts that every format in `formats` classifies as `expected`,
    /// naming the offending format (and the expected `label`) on failure.
    fn expect_all(formats: &[&str], expected: &Class, label: &str) {
        for fmt in formats {
            assert_eq!(
                match_var_format(fmt).as_ref(),
                Some(expected),
                "Expected {label} for format: {fmt}"
            );
        }
    }

    // --- Date formats ---

    #[test]
    fn date_formats_with_numeric_width() {
        // Formats that were supported before the letter-width variants.
        for fmt in ["DATE9", "DDMMYY10", "DDMMYYB10", "MMDDYY10", "YYMMDD10"] {
            expect_class(fmt, Class::Date);
        }
    }

    #[test]
    fn date_formats_with_letter_width() {
        // Format strings as stored in the test SAS datasets.
        const DATE_FORMATS: &[&str] = &[
            "B8601DAW",
            "B8601DNW",
            "DATEW",
            "DAYW",
            "DDMMYYW",
            "DDMMYYXW",
            "DOWNAMEW",
            "DTDATEW",
            "DTMONXYW",
            "DTWKDATXW",
            "DTYEARW",
            "DTYYQCW",
            "E8601DAW",
            "E8601DNW",
            "JULDAYW",
            "JULIANW",
            "MMDDYYW",
            "MMDDYYXW",
            "MMYYW",
            "MMYYXW",
            "MONNAMEW",
            "MONTHW",
            "MONYYW",
            "NENGOW",
            "NLDATEW",
            "NLDATECPWP",
            "NLDATELW",
            "NLDATEMW",
            "NLDATEMDW",
            "NLDATEMDLW",
            "NLDATEMDMW",
            "NLDATEMDSW",
            "NLDATEMNW",
            "NLDATESW",
            "NLDATEWW",
            "NLDATEWNW",
            "NLDATEYMW",
            "NLDATEYMLW",
            "NLDATEYMMW",
            "NLDATEYMSW",
            "NLDATEYQW",
            "NLDATEYQLW",
            "NLDATEYQMW",
            "NLDATEYQSW",
            "NLDATEYRW",
            "NLDATEYWW",
            "QTRW",
            "QTRRW",
            "WEEKDATXW",
            "WEEKDAYW",
            "YEARW",
            "YYMMW",
            "YYMMDDW",
            "YYMMDDXW",
            "YYMMXW",
            "YYMONW",
            "YYQW",
            "YYQXW",
            "YYQRW",
            "YYQRXW",
            "YYWEEKUW",
            "YYWEEKVW",
            "YYWEEKWW",
        ];
        expect_all(DATE_FORMATS, &Class::Date, "Date");
    }

    // --- Time formats ---

    #[test]
    fn time_format_bare() {
        expect_class("TIME", Class::Time);
        expect_class("TIME8", Class::Time);
    }

    #[test]
    fn time_formats_with_letter_width() {
        const TIME_FORMATS: &[&str] = &[
            "B8601LZW",
            "B8601TMWD",
            "B8601TXW",
            "B8601TZW",
            "E8601LZW",
            "E8601TMWD",
            "E8601TXW",
            "E8601TZWD",
            "HHMMWD",
            "HOURWD",
            "MMSSWD",
            "NLDATMTMW",
            "NLDATMTZW",
            "NLTIMAPW",
            "NLTIMEW",
            "TIMEWD",
            "TIMEAMPMWD",
            "TODWD",
        ];
        expect_all(TIME_FORMATS, &Class::Time, "Time");
    }

    // --- Datetime formats ---

    #[test]
    fn datetime_format_with_numeric_width() {
        expect_class("DATETIME22", Class::DateTime);
    }

    #[test]
    fn time_precision_formats() {
        expect_class("TIME15.6", Class::TimeWithMicroseconds);
        expect_class("TIME15.4", Class::TimeWithMicroseconds);
        expect_class("TIME15.5", Class::TimeWithMicroseconds);
        // No decimal part means no sub-second precision: plain Time.
        expect_class("TIME15", Class::Time);
    }

    #[test]
    fn datetime_precision_formats() {
        expect_class("DATETIME22.3", Class::DateTimeWithMilliseconds);
        expect_class("DATETIME22.6", Class::DateTimeWithMicroseconds);
        expect_class("DATETIME22.9", Class::DateTimeWithNanoseconds);
    }

    #[test]
    fn datetime_formats_with_letter_width() {
        const DATETIME_FORMATS: &[&str] = &[
            "B8601DTWD",
            "B8601DXW",
            "B8601DZW",
            "B8601LXW",
            "DATEAMPMWD",
            "DATETIMEWD",
            "E8601DTWD",
            "E8601DXW",
            "E8601DZW",
            "E8601LXW",
            "MDYAMPMWD",
            "NLDATMW",
            "NLDATMAPW",
            "NLDATMCPWP",
            "NLDATMDTW",
            "NLDATMLW",
            "NLDATMMW",
            "NLDATMMDW",
            "NLDATMMDLW",
            "NLDATMMDMW",
            "NLDATMMDSW",
            "NLDATMMNW",
            "NLDATMSW",
            "NLDATMWW",
            "NLDATMWNW",
            "NLDATMWZW",
            "NLDATMYMW",
            "NLDATMYMLW",
            "NLDATMYMMW",
            "NLDATMYMSW",
            "NLDATMYQW",
            "NLDATMYQLW",
            "NLDATMYQMW",
            "NLDATMYQSW",
            "NLDATMYRW",
            "NLDATMYWW",
            "NLDATMZW",
        ];
        expect_all(DATETIME_FORMATS, &Class::DateTime, "DateTime");
    }

    // --- Non-matching formats ---

    #[test]
    fn non_date_time_formats() {
        for fmt in ["BEST12", "$30", "$10", "COMMA12", ""] {
            assert_eq!(match_var_format(fmt), None);
        }
    }

    // --- Case insensitivity ---

    #[test]
    fn case_insensitive() {
        expect_class("date9", Class::Date);
        expect_class("datetime22", Class::DateTime);
        expect_class("time8", Class::Time);
    }
}