# Source code for lovelyrita.data

from __future__ import print_function
import numpy as np
import pandas as pd
from shapely.geometry import Point
import geopandas
from lovelyrita.clean import clean as clean_data
from lovelyrita.config import VALID_COLUMN_NAMES as valid_column_names


def read_data(paths, usecols=None, delimiter=',', clean=False):
    """Load one or more delimited text files into a single DataFrame.

    Every column is read as a string. If a ``street`` column is loaded, its
    values are stripped of leading and trailing spaces.

    Parameters
    ----------
    paths : str or list of str
        A file path, or a list/tuple of file paths, to the data to be loaded
    usecols : list of str, optional
        Names of the columns to load from every file. If None, the columns
        are determined separately for each file with `get_column_names`.
    delimiter : str
        Field delimiter passed to `pandas.read_csv`
    clean : bool
        If True, pass each file's DataFrame through `clean_data` before it
        is appended to the result

    Returns
    -------
    A DataFrame containing the rows of all files, with a fresh integer index
    """
    if not isinstance(paths, (tuple, list)):
        paths = [paths]

    frames = []
    for path in paths:
        # Resolve the columns per file. Overwriting `usecols` here would pin
        # the columns found in the first file onto every later file.
        # NOTE: get_column_names is called with its own default delimiter.
        if usecols is None:
            columns = get_column_names(path)
        else:
            columns = usecols

        df = pd.read_csv(path, dtype='str', usecols=columns,
                         delimiter=delimiter)

        # Only tidy the street names when that column was actually loaded.
        if 'street' in df.columns:
            df['street'] = df['street'].str.strip(' ')

        if clean:
            df = clean_data(df)

        frames.append(df)

    return pd.concat(frames).reset_index(drop=True)


def get_column_names(path, valid_column_names=valid_column_names, delimiter=','):
    """Return the columns of a file that are also valid column names.

    Parameters
    ----------
    path : str
        Path of the delimited text file to inspect
    valid_column_names : list of str
        Column names that are allowed in the result
    delimiter : str
        Field delimiter used to parse the header

    Returns
    -------
    A list of the file's column names that appear in `valid_column_names`,
    in the order they occur in the file
    """
    # Iterating a DataFrame yields its column labels; one row is enough to
    # obtain the header.
    header = pd.read_csv(path, nrows=1, delimiter=delimiter)
    return [name for name in header.columns if name in valid_column_names]


def to_geodataframe(dataframe, copy=False, drop_null_geometry=True,
                    projection='epsg:4326'):
    """Convert a pandas DataFrame to a geopandas GeoDataFrame.

    The ``latitude`` and ``longitude`` columns are replaced by a ``geometry``
    column of points. Rows whose latitude is 0, or whose latitude or
    longitude is NaN, get a null geometry.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain latitude and longitude fields
    copy : bool
        If True, work on a copy. If False, `dataframe` itself is modified:
        the coordinate columns are dropped and a geometry column is added.
    drop_null_geometry : bool
        If True, rows with a null geometry are left out of the result
    projection : str
        Passed to the GeoDataFrame as ``crs={'init': projection}``

    Returns
    -------
    A GeoDataFrame of the given DataFrame
    """
    df = dataframe.copy() if copy else dataframe

    df['latitude'] = df['latitude'].astype('float32')
    df['longitude'] = df['longitude'].astype('float32')

    # A NaN coordinate must yield a null geometry; Point(nan, nan) is a
    # non-null object and would slip past the null filter below.
    df['geometry'] = [
        None if (lat == 0 or np.isnan(lat) or np.isnan(lon)) else Point(lon, lat)
        for lat, lon in zip(df['latitude'], df['longitude'])
    ]

    df.drop(['latitude', 'longitude'], axis=1, inplace=True)

    if drop_null_geometry:
        # Copy so the column assignments below do not write into a slice.
        df = df.loc[df['geometry'].notnull()].copy()

    # geopandas cannot handle datetime formats, so convert to string
    for column in df.select_dtypes(include=['datetime']).columns:
        df[column] = df[column].dt.strftime('%m/%d/%y %H:%M:%S')

    return geopandas.GeoDataFrame(df, geometry='geometry', crs={'init': projection})


def write_shapefile(geodataframe, path):
    """Write a geodataframe to a shapefile.

    Parameters
    ----------
    geodataframe : geopandas.GeoDataFrame
        The data to write
    path : str
        Destination passed to `GeoDataFrame.to_file`
    """
    geodataframe.to_file(path, driver='ESRI Shapefile')


def get_sample_value(series):
    """Return a sample value from a series.

    Parameters
    ----------
    series : pandas.Series

    Returns
    -------
    The first non-null value in the series, or None if the series is empty
    or all of its values are null
    """
    for value in series.unique():
        # An identity test against np.nan misses NaN objects read out of a
        # float array, as well as None and NaT; pd.isnull covers them all.
        if not pd.isnull(value):
            return value
    return None


def summarize(dataframe):
    """Generate a summary of the data in a dataframe.

    Parameters
    ----------
    dataframe : pandas.DataFrame

    Returns
    -------
    A DataFrame with one row per column of `dataframe`, containing the data
    type, number of unique values, a sample value, and the number and percent
    of null values. Rows are sorted by the number of null values. For a
    dataframe without rows the percent of null values is NaN.
    """
    n_rows = dataframe.shape[0]

    column_report = []
    for column in dataframe.columns:
        series = dataframe[column]
        n_null = series.isnull().sum()
        # The percentage is undefined for an empty frame; report NaN
        # explicitly instead of dividing by zero.
        pct_null = 100. * n_null / n_rows if n_rows else float('nan')
        column_report.append([column, series.dtype, len(series.unique()),
                              get_sample_value(series), n_null, pct_null])

    columns = ["Column Name", "Data Type", "Unique Count", "Sample Value", "null", "% null"]
    column_report = pd.DataFrame(column_report, columns=columns).round(2)

    return column_report.sort_values(by="null")