# Source code for pandect.pandect
#!/usr/bin/env python3
import logging
import os
import re
import sys
import pandas
import pyreadstat
########################################################################
def expand_path(x):
    """Return path ``x`` with ``~`` and environment variables expanded.

    The user-directory prefix is expanded first, then environment
    variable references in the result. The expanded path is logged at
    debug level before it is returned.
    """
    with_home = os.path.expanduser(x)
    expanded = os.path.expandvars(with_home)
    logging.debug(f"expanded: {expanded}")
    return expanded
def load(source,
         sep=',',
         expand=True,
         flags=re.IGNORECASE,
         table=None,
         ):
    """Load dataset into pandas.DataFrame object

    Uses file extension as heuristic to determine input format.
    Supports: csv, tsv, xlsx, sav, dta (unreliable), sqlite3

    Parameters
    ----------
    source : str or os.PathLike
        Path of the file to load
    sep : str
        Separator used by csv
    expand : bool
        Expand ~ and environment variables in path strings
    flags : re.RegexFlag
        Regular expression flags for matching file name extensions
    table : str
        Name of table to load (needed for some database input sources)

    Returns
    -------
    data : pandas.DataFrame
        DataFrame object
    meta : pyreadstat.metadata_container
        Metadata (empty if not provided by data source)

    Raises
    ------
    FileNotFoundError
        If the (expanded) path does not exist
    IOError
        If source is not a path, if the file extension is not
        recognized, or if table is missing for an sqlite3 source

    Notes
    -----
    Loading dta files is unreliable (bug in pyreadstat, might segfault)

    For sqlite3 sources the table name is inserted into the query text
    as given, so it must come from a trusted caller.

    Metadata Objects
    ----------------
    Incomplete list of metadata:

    - column_names : list with the names of the columns
    - column_labels : list with the column labels, if any
    - column_names_to_labels : dict{column_names: column_labels}
    - variable_value_labels : dict{variable_names: dict}
    - variable_to_label : dict{variable_names: label_name}
    - value_labels : dict{label_name: dict}
    - variable_measure : nominal, ordinal, scale or unknown

    See the pyreadstat web docs for complete spec.
    """
    # Accept pathlib.Path and friends by converting them to their
    # string form before the str check below.
    if isinstance(source, os.PathLike):
        source = os.fspath(source)
    if not isinstance(source, str):
        message = f"unrecognized data source {source}"
        logging.error(message)
        raise IOError(message)

    logging.info(f"data source: {source}")
    if expand:
        source = expand_path(source)
    if not os.path.exists(source):
        logging.error(f"file not found: {source}")
        raise FileNotFoundError(source)

    # Only the pyreadstat readers supply metadata; every other reader
    # leaves this as None and gets an empty container at the end.
    meta = None
    if re.search(r'\.csv$', source, flags):
        data = pandas.read_csv(source, sep=sep)
    elif re.search(r'\.tsv$', source, flags):
        data = pandas.read_csv(source, sep='\t')
    elif re.search(r'\.xlsx$', source, flags):
        data = pandas.read_excel(source)
    elif re.search(r'\.sav$', source, flags):
        data, meta = pyreadstat.read_sav(source)
    elif re.search(r'\.dta$', source, flags):
        logging.warning("loading dta files known to cause segfaults")
        data, meta = pyreadstat.read_dta(source)
    elif re.search(r'\.sqlite3$', source, flags):
        if table is None:
            message = "missing table specification for sqlite"
            logging.error(message)
            raise IOError(message)
        # Imported here so this function does not depend on a
        # module-level import of sqlite3 being present.
        import sqlite3
        # NOTE: identifiers cannot be bound as SQL parameters, so the
        # table name is interpolated verbatim (trusted input only).
        query = "SELECT * FROM %s" % (table)
        connection = sqlite3.connect(source)
        try:
            data = pandas.read_sql_query(query, connection)
        finally:
            # Release the database handle even if the query fails.
            connection.close()
    else:
        message = f"unrecognized file type {source}"
        logging.error(message)
        raise IOError(message)

    if meta is None:
        meta = pyreadstat.metadata_container()

    columns = list(data)
    logging.info('loaded data')
    logging.info(f"number of variables: {len(columns)}")
    logging.info(f"observations: {len(data)}")
    return data, meta