Skip to content

Commit 1240501

Browse files
authored
Disregard SAS and SPSS format width and precision when identifying date, datetime or time variables (#332)
* Disregard SAS and SPSS format width and precision when identifying date, datetime or time variables
1 parent 434e9fd commit 1240501

1 file changed

Lines changed: 43 additions & 36 deletions

File tree

pyreadstat/_readstat_parser.pyx

Lines changed: 43 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@ import datetime
2929
import os
3030
import warnings
3131
import sys
32+
import re
3233

3334
import narwhals.stable.v2 as nw
3435
import numpy as np
@@ -42,27 +43,27 @@ import_datetime()
4243

4344
cdef object unix_origin = datetime_new(1970, 1, 1, 0, 0, 0, 0, None)
4445

45-
cdef list sas_date_formats = ["WEEKDATE", "MMDDYY", "DDMMYY", "YYMMDD", "DATE", "DATE9", "YYMMDD10",
46-
"DDMMYYB", "DDMMYYB10", "DDMMYYC", "DDMMYYC10", "DDMMYYD", "DDMMYYD10",
47-
"DDMMYYN6", "DDMMYYN8", "DDMMYYP", "DDMMYYP10", "DDMMYYS", "DDMMYYS10",
48-
"MMDDYYB", "MMDDYYB10", "MMDDYYC", "MMDDYYC10", "MMDDYYD", "MMDDYYD10",
49-
"MMDDYYN6", "MMDDYYN8", "MMDDYYP", "MMDDYYP10", "MMDDYYS", "MMDDYYS10",
46+
cdef object format_regex = re.compile(r"^([A-Z][A-Z0-9]+[A-Z])(\d+)?(?(2)(?:\.\d+)?$|$)")
47+
cdef list sas_date_formats = ["WEEKDATE", "MMDDYY", "DDMMYY", "YYMMDD", "DATE", "YYMMDD",
48+
"DDMMYYB", "DDMMYYC", "DDMMYYD",
49+
"DDMMYYN", "DDMMYYP", "DDMMYYS",
50+
"MMDDYYB", "MMDDYYC", "MMDDYYD",
51+
"MMDDYYN", "MMDDYYP", "MMDDYYS",
5052
#"MONNAME", "MONTH", "WEEKDAY", "QTR", "QTRR", "YEAR","DAY", "DOWNAME" # these do not print as full dates in sas
5153
"WEEKDATX", "DTDATE",
5254
"IS8601DA", "E8601DA", "B8601DA",
53-
"YYMMDDB", "YYMMDDD", "YYMMDDN", "YYMMDDP", "YYMMDDS",]
54-
cdef list sas_datetime_formats = ["DATETIME", "DATETIME18", "DATETIME19", "DATETIME20", "DATETIME21", "DATETIME22",
55-
"E8601DT", "DATEAMPM", "MDYAMPM", "IS8601DT", "B8601DT", "B8601DN"]
56-
cdef list sas_time_formats = ["TIME", "HHMM", "TIME20.3", "TIME20", "TIME5", "TOD", "TIMEAMPM", "IS8601TM", "E8601TM", "B8601TM", ]
55+
"YYMMDDB", "YYMMDDD", "YYMMDDN", "YYMMDDP", "YYMMDDS"]
56+
cdef list sas_datetime_formats = ["DATETIME", "E8601DT", "DATEAMPM", "MDYAMPM", "IS8601DT", "B8601DT", "B8601DN"]
57+
cdef list sas_time_formats = ["TIME", "HHMM", "TOD", "TIMEAMPM", "IS8601TM", "E8601TM", "B8601TM"]
5758
# "HOUR" # these do not print as full time formats in sas
5859
#cdef list sas_all_formats = sas_date_formats + sas_datetime_formats + sas_time_formats
5960
cdef list sas_all_formats
6061
cdef object sas_origin = datetime_new(1960, 1, 1, 0, 0, 0, 0, None)
6162
cdef object sas_secs_from_unix = total_seconds(unix_origin - sas_origin)
6263

63-
cdef list spss_datetime_formats = ["DATETIME", "DATETIME8", 'DATETIME17', 'DATETIME20', 'DATETIME23.2',"YMDHMS16","YMDHMS19","YMDHMS19.2", "YMDHMS20"]
64-
cdef list spss_date_formats = ["DATE",'DATE8','DATE11', 'DATE12', "ADATE","ADATE8", "ADATE10", "EDATE", 'EDATE8','EDATE10', "JDATE", "JDATE5", "JDATE7", "SDATE", "SDATE8", "SDATE10",]
65-
cdef list spss_time_formats = ["TIME", "DTIME", 'TIME8', 'TIME5', 'TIME11.2']
64+
cdef list spss_datetime_formats = ["DATETIME", "YMDHMS"]
65+
cdef list spss_date_formats = ["DATE", "ADATE", "EDATE", "JDATE", "SDATE"]
66+
cdef list spss_time_formats = ["TIME", "DTIME"]
6667
#cdef list spss_all_formats = spss_date_formats + spss_datetime_formats + spss_time_formats
6768
cdef list spss_all_formats
6869
cdef object spss_origin = datetime_new(1582, 10, 14, 0, 0, 0, 0, None)
@@ -148,26 +149,32 @@ cdef py_datetime_format transform_variable_format(str var_format, py_file_format
148149
Transforms a readstat var_format to a date, datetime or time format label
149150
"""
150151
if file_format == FILE_FORMAT_SAS:
151-
if var_format in sas_all_formats:
152-
if var_format in sas_date_formats:
153-
return DATE_FORMAT_DATE
154-
elif var_format in sas_datetime_formats:
155-
return DATE_FORMAT_DATETIME
156-
elif var_format in sas_time_formats:
157-
return DATE_FORMAT_TIME
158-
else:
159-
return DATE_FORMAT_NOTADATE
152+
if var_format:
153+
format_match = format_regex.match(var_format)
154+
if format_match:
155+
var_format_name = format_match.group(1)
156+
if var_format_name in sas_all_formats:
157+
if var_format_name in sas_date_formats:
158+
return DATE_FORMAT_DATE
159+
elif var_format_name in sas_datetime_formats:
160+
return DATE_FORMAT_DATETIME
161+
elif var_format_name in sas_time_formats:
162+
return DATE_FORMAT_TIME
163+
return DATE_FORMAT_NOTADATE
160164

161165
elif file_format == FILE_FORMAT_SPSS:
162-
if var_format in spss_all_formats:
163-
if var_format in spss_date_formats:
164-
return DATE_FORMAT_DATE
165-
elif var_format in spss_datetime_formats:
166-
return DATE_FORMAT_DATETIME
167-
elif var_format in spss_time_formats:
168-
return DATE_FORMAT_TIME
169-
else:
170-
return DATE_FORMAT_NOTADATE
166+
if var_format:
167+
format_match = format_regex.match(var_format)
168+
if format_match:
169+
var_format_name = format_match.group(1)
170+
if var_format_name in spss_all_formats:
171+
if var_format_name in spss_date_formats:
172+
return DATE_FORMAT_DATE
173+
elif var_format_name in spss_datetime_formats:
174+
return DATE_FORMAT_DATETIME
175+
elif var_format_name in spss_time_formats:
176+
return DATE_FORMAT_TIME
177+
return DATE_FORMAT_NOTADATE
171178

172179
elif file_format == FILE_FORMAT_STATA:
173180
if var_format in stata_all_formats:
@@ -1267,27 +1274,27 @@ cdef object run_conversion(object filename_path, py_file_format file_format, py_
12671274

12681275
if extra_date_formats is not None:
12691276
if file_format == FILE_FORMAT_SAS:
1270-
sas_date_formats.extend(extra_date_formats)
1277+
sas_date_formats.extend([format_regex.match(edf).group(1) for edf in extra_date_formats if format_regex.match(edf)])
12711278
elif file_format == FILE_FORMAT_SPSS:
1272-
spss_date_formats.extend(extra_date_formats)
1279+
spss_date_formats.extend([format_regex.match(edf).group(1) for edf in extra_date_formats if format_regex.match(edf)])
12731280
elif file_format == FILE_FORMAT_STATA:
12741281
stata_date_formats.extend(extra_date_formats)
12751282
else:
12761283
raise PyreadstatError("Unknown file format")
12771284
if extra_datetime_formats is not None:
12781285
if file_format == FILE_FORMAT_SAS:
1279-
sas_datetime_formats.extend(extra_datetime_formats)
1286+
sas_datetime_formats.extend([format_regex.match(edtf).group(1) for edtf in extra_datetime_formats if format_regex.match(edtf)])
12801287
elif file_format == FILE_FORMAT_SPSS:
1281-
spss_datetime_formats.extend(extra_datetime_formats)
1288+
spss_datetime_formats.extend([format_regex.match(edtf).group(1) for edtf in extra_datetime_formats if format_regex.match(edtf)])
12821289
elif file_format == FILE_FORMAT_STATA:
12831290
stata_datetime_formats.extend(extra_datetime_formats)
12841291
else:
12851292
raise PyreadstatError("Unknown file format")
12861293
if extra_time_formats is not None:
12871294
if file_format == FILE_FORMAT_SAS:
1288-
sas_time_formats.extend(extra_time_formats)
1295+
sas_time_formats.extend([format_regex.match(etf).group(1) for etf in extra_time_formats if format_regex.match(etf)])
12891296
elif file_format == FILE_FORMAT_SPSS:
1290-
spss_time_formats.extend(extra_time_formats)
1297+
spss_time_formats.extend([format_regex.match(etf).group(1) for etf in extra_time_formats if format_regex.match(etf)])
12911298
elif file_format == FILE_FORMAT_STATA:
12921299
stata_time_formats.extend(extra_time_formats)
12931300
else:

0 commit comments

Comments
 (0)