1use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
9use log::debug;
10use num_derive::FromPrimitive;
11use serde::Serialize;
12#[cfg(any(not(target_arch = "wasm32"), test))]
13use std::fs::File;
14use std::{
15 collections::{BTreeMap, BTreeSet, HashMap},
16 ffi::{CString, c_void},
17 path::Path,
18};
19
20use crate::cb::{handle_metadata, handle_variable};
21use crate::err::{ReadStatError, check_c_error};
22use crate::rs_buffer_io::ReadStatBufferCtx;
23use crate::rs_parser::ReadStatParser;
24use crate::rs_path::ReadStatPath;
25use crate::rs_var::{ReadStatVarFormatClass, ReadStatVarType, ReadStatVarTypeClass};
26
/// File-level metadata for a dataset, plus the Arrow [`Schema`] derived from its variables.
///
/// A value starts out empty (see [`ReadStatMetadata::new`]) and is filled in by the
/// `read_metadata*` methods, which hand a pointer to `self` to the parser callbacks.
/// The type is serializable with `serde`; `schema` is left out of the serialized form.
#[derive(Clone, Debug, Serialize)]
pub struct ReadStatMetadata {
    /// Number of rows reported for the dataset.
    ///
    /// NOTE(review): when metadata is read with `skip_row_count = true` the parser is
    /// given a row limit of 1, so this may not reflect the full file — confirm.
    pub row_count: i32,
    /// Number of variables (columns).
    pub var_count: i32,
    /// Name of the table.
    pub table_name: String,
    /// Label of the file; when non-empty it is stored in the schema metadata under
    /// the key `table_label`.
    pub file_label: String,
    /// Encoding reported for the file.
    pub file_encoding: String,
    /// Version number reported for the file.
    pub version: i32,
    /// Whether the file is flagged as 64-bit.
    pub is_64bit: bool,
    /// Creation time, kept as text.
    pub creation_time: String,
    /// Last-modified time, kept as text.
    pub modified_time: String,
    /// Compression scheme reported for the file.
    pub compression: ReadStatCompress,
    /// Byte order reported for the file.
    pub endianness: ReadStatEndian,
    /// Per-variable metadata keyed by variable index; the ascending key order of this
    /// map is the field order of `schema`.
    pub vars: BTreeMap<i32, ReadStatVarMetadata>,
    /// Arrow schema built from `vars` and `file_label`; not serialized.
    #[serde(skip_serializing)]
    pub schema: Schema,
}
62
/// `Default` is the empty value produced by [`ReadStatMetadata::new`].
impl Default for ReadStatMetadata {
    /// Returns the same empty metadata as [`ReadStatMetadata::new`].
    fn default() -> Self {
        Self::new()
    }
}
68
69impl ReadStatMetadata {
70 pub fn new() -> Self {
72 Self {
73 row_count: 0,
74 var_count: 0,
75 table_name: String::new(),
76 file_label: String::new(),
77 file_encoding: String::new(),
78 version: 0,
79 is_64bit: false,
80 creation_time: String::new(),
81 modified_time: String::new(),
82 compression: ReadStatCompress::None,
83 endianness: ReadStatEndian::None,
84 vars: BTreeMap::new(),
85 schema: Schema::empty(),
86 }
87 }
88
89 fn initialize_schema(&self) -> Schema {
90 let fields: Vec<Field> = self
92 .vars
93 .values()
94 .map(|vm| {
95 let var_dt = match &vm.var_type {
96 ReadStatVarType::String
97 | ReadStatVarType::StringRef
98 | ReadStatVarType::Unknown => DataType::Utf8,
99 ReadStatVarType::Int8 | ReadStatVarType::Int16 => DataType::Int16,
100 ReadStatVarType::Int32 => DataType::Int32,
101 ReadStatVarType::Float => DataType::Float32,
102 ReadStatVarType::Double => match &vm.var_format_class {
103 Some(ReadStatVarFormatClass::Date) => DataType::Date32,
104 Some(ReadStatVarFormatClass::DateTime) => {
105 DataType::Timestamp(TimeUnit::Second, None)
106 }
107 Some(ReadStatVarFormatClass::DateTimeWithMilliseconds) => {
108 DataType::Timestamp(TimeUnit::Millisecond, None)
109 }
110 Some(ReadStatVarFormatClass::DateTimeWithMicroseconds) => {
111 DataType::Timestamp(TimeUnit::Microsecond, None)
112 }
113 Some(ReadStatVarFormatClass::DateTimeWithNanoseconds) => {
114 DataType::Timestamp(TimeUnit::Nanosecond, None)
115 }
116 Some(ReadStatVarFormatClass::Time) => DataType::Time32(TimeUnit::Second),
117 Some(ReadStatVarFormatClass::TimeWithMilliseconds) => {
118 DataType::Time32(TimeUnit::Millisecond)
119 }
120 Some(ReadStatVarFormatClass::TimeWithMicroseconds) => {
121 DataType::Time64(TimeUnit::Microsecond)
122 }
123 Some(ReadStatVarFormatClass::TimeWithNanoseconds) => {
124 DataType::Time64(TimeUnit::Nanosecond)
125 }
126 None => DataType::Float64,
127 },
128 };
129
130 let mut field = Field::new(&vm.var_name, var_dt, true);
132 let mut metadata = HashMap::new();
133 if !vm.var_label.is_empty() {
134 metadata.insert("label".to_string(), vm.var_label.clone());
135 }
136 if !vm.var_format.is_empty() {
137 metadata.insert("sas_format".to_string(), vm.var_format.clone());
138 }
139 metadata.insert("storage_width".to_string(), vm.storage_width.to_string());
140 if vm.display_width != 0 {
141 metadata.insert("display_width".to_string(), vm.display_width.to_string());
142 }
143 if !metadata.is_empty() {
144 field = field.with_metadata(metadata);
145 }
146 field
147 })
148 .collect();
149
150 if self.file_label.is_empty() {
152 Schema::new(fields)
153 } else {
154 let mut schema_metadata = HashMap::new();
155 schema_metadata.insert("table_label".to_string(), self.file_label.clone());
156 Schema::new_with_metadata(fields, schema_metadata)
157 }
158 }
159
160 #[allow(clippy::cast_possible_wrap, clippy::ptr_as_ptr)]
171 pub fn read_metadata(
172 &mut self,
173 rsp: &ReadStatPath,
174 skip_row_count: bool,
175 ) -> Result<(), ReadStatError> {
176 debug!("Path as C string is {:?}", rsp.cstring_path);
177 let ppath = rsp.cstring_path.as_ptr();
178
179 let ctx = std::ptr::from_mut::<Self>(self) as *mut c_void;
180
181 let error: readstat_sys::readstat_error_t = readstat_sys::readstat_error_e_READSTAT_OK;
182 debug!("Initially, error ==> {error}");
183
184 let row_limit = if skip_row_count { Some(1) } else { None };
185
186 let error = ReadStatParser::new()?
187 .set_metadata_handler(Some(handle_metadata))?
188 .set_variable_handler(Some(handle_variable))?
189 .set_row_limit(row_limit)?
190 .parse_sas7bdat(ppath, ctx);
191
192 check_c_error(error as i32)?;
193
194 self.schema = self.initialize_schema();
196 Ok(())
197 }
198
199 #[allow(clippy::cast_possible_wrap, clippy::ptr_as_ptr)]
213 pub fn read_metadata_from_bytes(
214 &mut self,
215 bytes: &[u8],
216 skip_row_count: bool,
217 ) -> Result<(), ReadStatError> {
218 let mut buffer_ctx = ReadStatBufferCtx::new(bytes);
219
220 let ctx = std::ptr::from_mut::<Self>(self) as *mut c_void;
221
222 let error: readstat_sys::readstat_error_t = readstat_sys::readstat_error_e_READSTAT_OK;
223 debug!("Initially, error ==> {error}");
224
225 let row_limit = if skip_row_count { Some(1) } else { None };
226
227 let dummy_path = CString::new("").expect("empty string is valid C string");
229
230 let error = buffer_ctx
231 .configure_parser(
232 ReadStatParser::new()?
233 .set_metadata_handler(Some(handle_metadata))?
234 .set_variable_handler(Some(handle_variable))?
235 .set_row_limit(row_limit)?,
236 )?
237 .parse_sas7bdat(dummy_path.as_ptr(), ctx);
238
239 check_c_error(error as i32)?;
240
241 self.schema = self.initialize_schema();
243 Ok(())
244 }
245
246 #[cfg(not(target_arch = "wasm32"))]
263 pub fn read_metadata_from_mmap(
264 &mut self,
265 path: &Path,
266 skip_row_count: bool,
267 ) -> Result<(), ReadStatError> {
268 let file = File::open(path)?;
269 let mmap = unsafe { memmap2::Mmap::map(&file)? };
270 self.read_metadata_from_bytes(&mmap, skip_row_count)
271 }
272
273 pub fn parse_columns_file(path: &Path) -> Result<Vec<String>, ReadStatError> {
282 let contents = std::fs::read_to_string(path)?;
283 let names: Vec<String> = contents
284 .lines()
285 .map(str::trim)
286 .filter(|line| !line.is_empty() && !line.starts_with('#'))
287 .map(std::string::ToString::to_string)
288 .collect();
289 Ok(names)
290 }
291
292 pub fn resolve_selected_columns(
303 &self,
304 columns: Option<Vec<String>>,
305 ) -> Result<Option<BTreeMap<i32, i32>>, ReadStatError> {
306 let Some(columns) = columns else {
307 return Ok(None);
308 };
309
310 let requested: BTreeSet<String> = columns.into_iter().collect();
312
313 let name_to_index: HashMap<&str, i32> = self
315 .vars
316 .iter()
317 .map(|(&idx, vm)| (vm.var_name.as_str(), idx))
318 .collect();
319
320 let not_found: Vec<String> = requested
322 .iter()
323 .filter(|name| !name_to_index.contains_key(name.as_str()))
324 .cloned()
325 .collect();
326
327 if !not_found.is_empty() {
328 let available: Vec<String> = self.vars.values().map(|vm| vm.var_name.clone()).collect();
329 return Err(ReadStatError::ColumnsNotFound {
330 requested: not_found,
331 available,
332 });
333 }
334
335 let mut mapping = BTreeMap::new();
338 let mut new_index = 0i32;
339 for (&orig_index, vm) in &self.vars {
340 if requested.contains(&vm.var_name) {
341 mapping.insert(orig_index, new_index);
342 new_index += 1;
343 }
344 }
345
346 Ok(Some(mapping))
347 }
348
349 #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
355 pub fn filter_to_selected_columns(&self, mapping: &BTreeMap<i32, i32>) -> Self {
356 let new_vars: BTreeMap<i32, ReadStatVarMetadata> = mapping
357 .iter()
358 .filter_map(|(&orig_idx, &new_idx)| {
359 self.vars.get(&orig_idx).map(|vm| (new_idx, vm.clone()))
360 })
361 .collect();
362
363 let mut filtered = Self {
364 row_count: self.row_count,
365 var_count: mapping.len() as i32,
366 table_name: self.table_name.clone(),
367 file_label: self.file_label.clone(),
368 file_encoding: self.file_encoding.clone(),
369 version: self.version,
370 is_64bit: self.is_64bit,
371 creation_time: self.creation_time.clone(),
372 modified_time: self.modified_time.clone(),
373 compression: self.compression.clone(),
374 endianness: self.endianness.clone(),
375 vars: new_vars,
376 schema: Schema::empty(),
377 };
378 filtered.schema = filtered.initialize_schema();
379 filtered
380 }
381}
382
/// Compression scheme of a file.
///
/// Discriminants are taken from the `readstat_sys::readstat_compress_e_*` constants,
/// and `FromPrimitive` is derived so a value can be built from a raw integer.
/// Marked `#[non_exhaustive]`, so matches outside this crate need a wildcard arm.
#[non_exhaustive]
#[derive(Clone, Debug, Default, FromPrimitive, Serialize)]
// Silences the lint for the `as isize` discriminant casts below.
#[allow(clippy::cast_possible_wrap)]
pub enum ReadStatCompress {
    /// `READSTAT_COMPRESS_NONE`; also the default value.
    #[default]
    None = readstat_sys::readstat_compress_e_READSTAT_COMPRESS_NONE as isize,
    /// `READSTAT_COMPRESS_ROWS`.
    Rows = readstat_sys::readstat_compress_e_READSTAT_COMPRESS_ROWS as isize,
    /// `READSTAT_COMPRESS_BINARY`.
    Binary = readstat_sys::readstat_compress_e_READSTAT_COMPRESS_BINARY as isize,
}
399
/// Byte order of a file.
///
/// Discriminants are taken from the `readstat_sys::readstat_endian_e_*` constants,
/// and `FromPrimitive` is derived so a value can be built from a raw integer.
/// Marked `#[non_exhaustive]`, so matches outside this crate need a wildcard arm.
#[non_exhaustive]
#[derive(Clone, Debug, Default, FromPrimitive, Serialize)]
// Silences the lint for the `as isize` discriminant casts below.
#[allow(clippy::cast_possible_wrap)]
pub enum ReadStatEndian {
    /// `READSTAT_ENDIAN_NONE`; also the default value.
    #[default]
    None = readstat_sys::readstat_endian_e_READSTAT_ENDIAN_NONE as isize,
    /// `READSTAT_ENDIAN_LITTLE`.
    Little = readstat_sys::readstat_endian_e_READSTAT_ENDIAN_LITTLE as isize,
    /// `READSTAT_ENDIAN_BIG`.
    Big = readstat_sys::readstat_endian_e_READSTAT_ENDIAN_BIG as isize,
}
416
/// Metadata describing a single variable (column).
#[derive(Clone, Debug, Serialize)]
pub struct ReadStatVarMetadata {
    /// Variable name; used as the Arrow field name.
    pub var_name: String,
    /// Storage type of the variable; determines the Arrow data type of the field.
    pub var_type: ReadStatVarType,
    /// Type class of the variable.
    pub var_type_class: ReadStatVarTypeClass,
    /// Variable label; stored in the field metadata as `label` when non-empty.
    pub var_label: String,
    /// Format string; stored in the field metadata as `sas_format` when non-empty.
    pub var_format: String,
    /// Date/time classification of the format. For `Double` variables it selects a
    /// date, time or timestamp Arrow type; `None` leaves them as `Float64`.
    pub var_format_class: Option<ReadStatVarFormatClass>,
    /// Storage width; always stored in the field metadata as `storage_width`.
    pub storage_width: usize,
    /// Display width; stored in the field metadata as `display_width` when non-zero.
    pub display_width: i32,
}
438
439impl ReadStatVarMetadata {
440 #[allow(clippy::too_many_arguments)]
442 pub fn new(
443 var_name: String,
444 var_type: ReadStatVarType,
445 var_type_class: ReadStatVarTypeClass,
446 var_label: String,
447 var_format: String,
448 var_format_class: Option<ReadStatVarFormatClass>,
449 storage_width: usize,
450 display_width: i32,
451 ) -> Self {
452 Self {
453 var_name,
454 var_type,
455 var_type_class,
456 var_label,
457 var_format,
458 var_format_class,
459 storage_width,
460 display_width,
461 }
462 }
463}
464
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    // ---- fixtures -------------------------------------------------------------------

    /// A numeric `Double` variable with storage width 8, no label and display width 0.
    fn double_var(
        name: &str,
        format: &str,
        format_class: Option<ReadStatVarFormatClass>,
    ) -> ReadStatVarMetadata {
        ReadStatVarMetadata::new(
            name.to_string(),
            ReadStatVarType::Double,
            ReadStatVarTypeClass::Numeric,
            String::new(),
            format.to_string(),
            format_class,
            8,
            0,
        )
    }

    /// Metadata holding exactly `var` at index 0; the schema is left uninitialised.
    fn single_var_metadata(var: ReadStatVarMetadata) -> ReadStatMetadata {
        let mut md = ReadStatMetadata::new();
        md.vars.insert(0, var);
        md.var_count = 1;
        md
    }

    /// Arrow data type of the only field in the schema built for `var`.
    fn sole_field_type(var: ReadStatVarMetadata) -> DataType {
        let schema = single_var_metadata(var).initialize_schema();
        schema.fields()[0].data_type().clone()
    }

    /// Metadata with one `BEST12` double variable per name, indexed from 0,
    /// and an initialised schema.
    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
    fn test_metadata(var_names: &[&str]) -> ReadStatMetadata {
        let mut md = ReadStatMetadata::new();
        md.vars = var_names
            .iter()
            .enumerate()
            .map(|(i, name)| (i as i32, double_var(name, "BEST12", None)))
            .collect();
        md.var_count = var_names.len() as i32;
        md.schema = md.initialize_schema();
        md
    }

    /// Writes each entry of `lines` followed by a newline to `cols.txt` in a fresh
    /// temporary directory. The returned directory guard must outlive use of the path.
    fn columns_file(lines: &[&str]) -> (tempfile::TempDir, std::path::PathBuf) {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("cols.txt");
        let mut f = File::create(&path).unwrap();
        for line in lines {
            writeln!(f, "{line}").unwrap();
        }
        (dir, path)
    }

    // ---- resolve_selected_columns ---------------------------------------------------

    #[test]
    fn resolve_columns_none_returns_none() {
        let md = test_metadata(&["a", "b", "c"]);
        let resolved = md.resolve_selected_columns(None).unwrap();
        assert!(resolved.is_none());
    }

    #[test]
    fn resolve_columns_valid_subset() {
        let md = test_metadata(&["a", "b", "c"]);
        let request = Some(vec!["a".into(), "c".into()]);
        let mapping = md.resolve_selected_columns(request).unwrap().unwrap();
        assert_eq!(mapping.len(), 2);
        // "a" keeps index 0; "c" moves from 2 to 1.
        assert_eq!(mapping[&0], 0);
        assert_eq!(mapping[&2], 1);
    }

    #[test]
    fn resolve_columns_invalid_name_errors() {
        let md = test_metadata(&["a", "b", "c"]);
        let request = Some(vec!["a".into(), "nonexistent".into()]);
        match md.resolve_selected_columns(request).unwrap_err() {
            ReadStatError::ColumnsNotFound {
                requested,
                available,
            } => {
                assert_eq!(requested, vec!["nonexistent"]);
                assert_eq!(available, vec!["a", "b", "c"]);
            }
            other => panic!("Expected ColumnsNotFound, got {other:?}"),
        }
    }

    #[test]
    fn resolve_columns_all_columns() {
        let md = test_metadata(&["x", "y", "z"]);
        let request = Some(vec!["x".into(), "y".into(), "z".into()]);
        let mapping = md.resolve_selected_columns(request).unwrap().unwrap();
        assert_eq!(mapping.len(), 3);
        assert_eq!(mapping[&0], 0);
        assert_eq!(mapping[&1], 1);
        assert_eq!(mapping[&2], 2);
    }

    // ---- filter_to_selected_columns -------------------------------------------------

    #[test]
    fn filter_produces_contiguous_indices() {
        let md = test_metadata(&["a", "b", "c", "d"]);
        let request = Some(vec!["b".into(), "d".into()]);
        let mapping = md.resolve_selected_columns(request).unwrap().unwrap();
        let filtered = md.filter_to_selected_columns(&mapping);

        assert_eq!(filtered.var_count, 2);
        assert_eq!(filtered.vars[&0].var_name, "b");
        assert_eq!(filtered.vars[&1].var_name, "d");
    }

    #[test]
    fn filter_preserves_schema() {
        let md = test_metadata(&["a", "b", "c"]);
        let request = Some(vec!["b".into()]);
        let mapping = md.resolve_selected_columns(request).unwrap().unwrap();
        let filtered = md.filter_to_selected_columns(&mapping);

        assert_eq!(filtered.schema.fields().len(), 1);
        assert_eq!(filtered.schema.fields()[0].name(), "b");
    }

    // ---- initialize_schema: data types ----------------------------------------------

    #[test]
    fn schema_string_type() {
        let var = ReadStatVarMetadata::new(
            "name".into(),
            ReadStatVarType::String,
            ReadStatVarTypeClass::String,
            String::new(),
            "$30".into(),
            None,
            30,
            0,
        );
        assert_eq!(sole_field_type(var), DataType::Utf8);
    }

    #[test]
    fn schema_float64_type() {
        let var = double_var("value", "BEST12", None);
        assert_eq!(sole_field_type(var), DataType::Float64);
    }

    #[test]
    fn schema_date_type() {
        let var = double_var("dt", "DATE9", Some(ReadStatVarFormatClass::Date));
        assert_eq!(sole_field_type(var), DataType::Date32);
    }

    #[test]
    fn schema_datetime_type() {
        let var = double_var("ts", "DATETIME22", Some(ReadStatVarFormatClass::DateTime));
        assert_eq!(
            sole_field_type(var),
            DataType::Timestamp(TimeUnit::Second, None)
        );
    }

    #[test]
    fn schema_time_type() {
        let var = double_var("tm", "TIME8", Some(ReadStatVarFormatClass::Time));
        assert_eq!(sole_field_type(var), DataType::Time32(TimeUnit::Second));
    }

    #[test]
    fn schema_int32_type() {
        let var = ReadStatVarMetadata::new(
            "count".into(),
            ReadStatVarType::Int32,
            ReadStatVarTypeClass::Numeric,
            String::new(),
            String::new(),
            None,
            4,
            0,
        );
        assert_eq!(sole_field_type(var), DataType::Int32);
    }

    // ---- initialize_schema: metadata ------------------------------------------------

    #[test]
    fn schema_with_labels_metadata() {
        let mut var = double_var("col", "BEST12", None);
        var.var_label = "My Label".into();
        let mut md = single_var_metadata(var);
        md.file_label = "My Table".into();
        let schema = md.initialize_schema();

        // Variable label lands in the field metadata.
        let field_meta = schema.fields()[0].metadata();
        assert_eq!(field_meta.get("label").unwrap(), "My Label");

        // File label lands in the schema metadata.
        let schema_meta = schema.metadata();
        assert_eq!(schema_meta.get("table_label").unwrap(), "My Table");
    }

    #[test]
    fn schema_no_labels_has_format_and_width_metadata() {
        let md = single_var_metadata(double_var("col", "BEST12", None));
        let schema = md.initialize_schema();

        let field_meta = schema.fields()[0].metadata();
        assert!(!field_meta.contains_key("label"));
        assert_eq!(field_meta.get("sas_format").unwrap(), "BEST12");
        assert_eq!(field_meta.get("storage_width").unwrap(), "8");
        assert!(!field_meta.contains_key("display_width"));
        assert!(schema.metadata().is_empty());
    }

    // ---- parse_columns_file ---------------------------------------------------------

    #[test]
    fn parse_columns_file_normal() {
        let (_dir, path) = columns_file(&["col_a", "col_b", "col_c"]);

        let names = ReadStatMetadata::parse_columns_file(&path).unwrap();
        assert_eq!(names, vec!["col_a", "col_b", "col_c"]);
    }

    #[test]
    fn parse_columns_file_with_comments_and_blanks() {
        let (_dir, path) = columns_file(&[
            "# This is a comment",
            "col_a",
            "",
            " col_b ",
            "# Another comment",
        ]);

        let names = ReadStatMetadata::parse_columns_file(&path).unwrap();
        assert_eq!(names, vec!["col_a", "col_b"]);
    }

    #[test]
    fn parse_columns_file_empty() {
        let (_dir, path) = columns_file(&[]);

        let names = ReadStatMetadata::parse_columns_file(&path).unwrap();
        assert!(names.is_empty());
    }

    #[test]
    fn parse_columns_file_nonexistent() {
        let missing = Path::new("/nonexistent/path/cols.txt");
        assert!(ReadStatMetadata::parse_columns_file(missing).is_err());
    }

    // ---- constructor ----------------------------------------------------------------

    #[test]
    fn default_metadata() {
        let md = ReadStatMetadata::new();
        assert_eq!(md.row_count, 0);
        assert_eq!(md.var_count, 0);
        assert!(md.table_name.is_empty());
        assert!(md.vars.is_empty());
        assert!(md.schema.fields().is_empty());
    }
}