Source code for lovelyrita.data

from __future__ import print_function
import numpy as np
import pandas as pd
from shapely.geometry import Point
import geopandas
from lovelyrita.clean import clean as clean_data
from lovelyrita.config import VALID_COLUMN_NAMES as valid_column_names


def read_data(paths, usecols=None, delimiter=',', clean=False):
    """Load one or more delimited text files into a single DataFrame.

    Parameters
    ----------
    paths : str or list of str
        A single file path, or a list/tuple of file paths, to be loaded.
    usecols : list of str, optional
        Columns to read from every file. When None, the columns are chosen
        separately for each file by `get_column_names`.
    delimiter : str
        Field delimiter handed to `pandas.read_csv`.
    clean : bool
        If True, each file's data is passed through `clean_data` before
        the files are concatenated.

    Returns
    -------
    pandas.DataFrame
        The rows of all files concatenated in the given order, re-indexed
        from zero. All columns are read with dtype 'str'.
    """
    if not isinstance(paths, (tuple, list)):
        paths = [paths]

    frames = []
    for path in paths:
        # Resolve the columns per file instead of overwriting `usecols`;
        # otherwise the columns detected in the first file would be imposed
        # on every later file.
        columns = get_column_names(path) if usecols is None else usecols
        df = pd.read_csv(path, dtype='str', usecols=columns,
                         delimiter=delimiter)
        # The caller's column selection may not include 'street'.
        if 'street' in df.columns:
            df['street'] = df['street'].str.strip(' ')
        if clean:
            df = clean_data(df)
        frames.append(df)

    return pd.concat(frames).reset_index(drop=True)
def get_column_names(path, valid_column_names=valid_column_names):
    """List the columns of a dataset that are also valid column names.

    Parameters
    ----------
    path : str
        Location of the delimited file to inspect.
    valid_column_names : list of str
        Names that are allowed in the result. Defaults to the module-level
        `valid_column_names`.

    Returns
    -------
    list of str
        The file's column names, in file order, restricted to those found
        in `valid_column_names`.
    """
    # Only the header matters here, so a single data row is read.
    preview = pd.read_csv(path, nrows=1)
    # Converting a DataFrame to a list yields its column labels.
    present = list(preview)
    return [name for name in present if name in valid_column_names]
def to_geodataframe(dataframe, copy=False, drop_null_geometry=True,
                    projection='epsg:4326'):
    """Convert a pandas DataFrame to a geopandas GeoDataFrame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain `latitude` and `longitude` columns.
    copy : bool
        If True, work on a copy. If False, `dataframe` itself is modified:
        the coordinate columns are cast to float32 and then dropped, and a
        `geometry` column is added.
    drop_null_geometry : bool
        If True, rows without a geometry are removed from the result.
    projection : str
        Projection identifier stored as the CRS of the result.

    Returns
    -------
    geopandas.GeoDataFrame
        The data with a `geometry` column of points in place of the
        `latitude` and `longitude` columns. A row has no geometry when its
        latitude is 0 or either coordinate is missing.
    """
    df = dataframe.copy() if copy else dataframe

    df.latitude = df.latitude.astype('float32')
    df.longitude = df.longitude.astype('float32')

    # A latitude of 0 is treated as "no location". NaN coordinates must be
    # treated the same way: NaN != 0, so they would otherwise become
    # Point(nan, nan) and never be removed by `drop_null_geometry`.
    df['geometry'] = [
        None if (x == 0 or pd.isnull(x) or pd.isnull(y)) else Point(y, x)
        for x, y in zip(df.latitude, df.longitude)
    ]
    df.drop(['latitude', 'longitude'], axis=1, inplace=True)

    if drop_null_geometry:
        # Copy so the column assignments below act on an independent frame
        # rather than on a filtered selection of the original.
        df = df.loc[~df.geometry.isnull()].copy()

    # geopandas cannot handle datetime formats, so convert to string
    for column in df.select_dtypes(include=['datetime']):
        df[column] = df[column].dt.strftime('%m/%d/%y %H:%M:%S')

    # NOTE(review): the {'init': ...} CRS form is reported as deprecated by
    # recent pyproj/geopandas releases; confirm the supported versions
    # before switching to `crs=projection`.
    return geopandas.GeoDataFrame(df, geometry='geometry',
                                  crs={'init': projection})
def write_shapefile(geodataframe, path):
    """Save a GeoDataFrame to disk using the ESRI Shapefile driver.

    Parameters
    ----------
    geodataframe : geopandas.GeoDataFrame
        The data to be written.
    path : str
        Destination passed to `GeoDataFrame.to_file`.
    """
    shapefile_driver = 'ESRI Shapefile'
    geodataframe.to_file(path, driver=shapefile_driver)
def get_sample_value(series):
    """Return a sample (the first non-null) value from a series.

    Parameters
    ----------
    series : pandas.Series

    Returns
    -------
    The first non-null value in the series, or None if the series is empty
    or all of its values are null.
    """
    # Drop nulls with pandas rather than testing `value is not np.nan`:
    # the identity test misses None, float('nan') and NaNs coming out of
    # float arrays, so a null could be returned as the "sample".
    candidates = series.dropna().unique()
    if len(candidates) == 0:
        return None
    return candidates[0]
def summarize(dataframe):
    """Build a per-column overview of a dataframe.

    Parameters
    ----------
    dataframe : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        One row per column of `dataframe`, holding the column name, its
        data type, the number of unique values, a sample value, and the
        count and percentage of null values. Numeric entries are rounded
        to two decimals and rows are ordered by ascending null count.
    """
    n_rows = dataframe.shape[0]

    rows = []
    for name in dataframe.columns:
        series = dataframe[name]
        null_count = series.isnull().sum()
        rows.append([
            name,
            series.dtype,
            len(series.unique()),
            get_sample_value(series),
            null_count,
            100. * null_count / n_rows,
        ])

    header = ["Column Name", "Data Type", "Unique Count", "Sample Value",
              "null", "% null"]
    report = pd.DataFrame(rows, columns=header).round(2)
    report = report.sort_values(by="null")
    return report