Source code for pandect.pandect

#!/usr/bin/env python3

import logging
import os
import re
import sqlite3
import sys

import pandas
import pyreadstat

########################################################################

def expand_path(x):
    """Expand ``~`` and environment variables in a path string.

    Parameters
    ----------
    x : str
        Path that may contain a leading ``~`` and/or ``$VAR`` references.

    Returns
    -------
    str
        The path after user-directory expansion followed by
        environment-variable expansion.
    """
    # expanduser runs first, then expandvars on its result.
    x = os.path.expandvars(os.path.expanduser(x))
    logging.debug(f"expanded: {x}")
    return x

def _read_sqlite_table(path, table):
    """Read every row of one table from an SQLite database file."""
    # Identifiers cannot be bound as SQL parameters, so the table name is
    # wrapped in double quotes with embedded quotes doubled; this keeps a
    # crafted name from altering the statement.
    quoted = '"' + str(table).replace('"', '""') + '"'
    connection = sqlite3.connect(path)
    try:
        return pandas.read_sql_query(f"SELECT * FROM {quoted}", connection)
    finally:
        # Close the handle whether or not the query succeeded.
        connection.close()


def load(source,
    sep=',',
    expand=True,
    flags=re.IGNORECASE,
    table=None,
):
    """Load dataset into pandas.DataFrame object

    Uses file extension as heuristic to determine input format.

    Supports: csv, tsv, xlsx, sav, dta (unreliable), sqlite3

    Parameters
    ----------
    source : str
        Path of the file to load
    sep : str
        Separator used by csv (tsv files always use a tab)
    expand : bool
        Expand ~ and environment variables in path strings
    flags : re.RegexFlag
        Regular expression flags for matching file name extensions
    table : str
        Name of table to load (needed for some database input sources)

    Returns
    -------
    data : pandas.DataFrame
        DataFrame object
    meta : pyreadstat.metadata_container
        Metadata (empty if not provided by data source)

    Raises
    ------
    FileNotFoundError
        If the (expanded) path does not exist
    IOError
        If source is not a string, the file extension is not recognized,
        or a sqlite3 source is given without a table name

    Notes
    -----
    Loading dta files is unreliable (bug in pyreadstat, might segfault)

    Metadata Objects
    ----------------

    Incomplete list of metadata:

    - column_names : list with the names of the columns
    - column_labels : list with the column labels, if any
    - column_names_to_labels : dict{column_names: column_labels}
    - variable_value_labels : dict{variable_names: dict}
    - variable_to_label : dict{variable_names: label_name}
    - value_labels : dict{label_name: dict}
    - variable_measure : nominal, ordinal, scale or unknown

    See the pyreadstat web docs for complete spec.
    """

    # Validate the argument before doing any other work.
    if not isinstance(source, str):
        message = f"unrecognized data source {source}"
        logging.error(message)
        raise IOError(message)

    logging.info(f"data source: {source}")

    if expand:
        source = expand_path(source)
    if not os.path.exists(source):
        logging.error(f"file not found: {source}")
        raise FileNotFoundError(source)

    # Only the pyreadstat readers return metadata; for every other format
    # an empty container is created after the data has been read.
    meta = None

    if re.search(r'\.csv$', source, flags):
        data = pandas.read_csv(source, sep=sep)
    elif re.search(r'\.tsv$', source, flags):
        data = pandas.read_csv(source, sep='\t')
    elif re.search(r'\.xlsx$', source, flags):
        data = pandas.read_excel(source)
    elif re.search(r'\.sav$', source, flags):
        data, meta = pyreadstat.read_sav(source)
    elif re.search(r'\.dta$', source, flags):
        logging.warning("loading dta files known to cause segfaults")
        data, meta = pyreadstat.read_dta(source)
    elif re.search(r'\.sqlite3$', source, flags):
        if table is None:
            message = "missing table specification for sqlite"
            logging.error(message)
            raise IOError(message)
        data = _read_sqlite_table(source, table)
    else:
        message = f"unrecognized file type {source}"
        logging.error(message)
        raise IOError(message)

    if meta is None:
        meta = pyreadstat.metadata_container()

    logging.info('loaded data')
    logging.info(f"number of variables: {len(data.columns)}")
    logging.info(f"observations: {len(data)}")
    return data, meta
Source code for pandect.pandect

pandect

Navigation

Related Topics