Source code for lovelyrita.utils

from __future__ import print_function
import numpy as np
import pandas as pd


def get_column_report(df):
    """Generate a summary of the data in a DataFrame

    One row is produced for every column of ``df``, with these fields:

    * ``Column Name`` -- the column label
    * ``Data Type`` -- the column's dtype
    * ``Unique Count`` -- number of distinct values as reported by
      ``Series.unique()`` (a missing value counts as one distinct value)
    * ``Sample Value`` -- the first non-missing value in the column, or NaN
      when the column holds no data at all
    * ``NaNs`` -- number of missing entries
    * ``% NaN`` -- percentage of missing entries (NaN for a frame with no rows)

    Parameters
    ----------
    df : pandas.DataFrame
        The data to summarise.

    Returns
    -------
    pandas.DataFrame
        The report, rounded to two decimals and sorted by ascending ``NaNs``.
    """
    n_rows = df.shape[0]
    column_report = []
    for column in df.columns:
        series = df[column]
        unique = series.unique()

        # An identity test against ``np.nan`` does not recognise the NaN
        # objects produced when iterating a float array, nor None/NaT, so let
        # pandas decide what is missing. dropna() keeps the original order, so
        # this is still the first real value encountered in the column.
        non_null = series.dropna()
        sample = non_null.iloc[0] if len(non_null) else np.nan

        nans = series.isnull().sum()
        # Avoid a 0/0 RuntimeWarning on an empty frame; the result stays NaN.
        pct_nan = 100. * nans / n_rows if n_rows else np.nan

        column_report.append([column, series.dtype, len(unique), sample,
                              nans, pct_nan])

    columns = ["Column Name", "Data Type", "Unique Count", "Sample Value",
               "NaNs", "% NaN"]
    column_report = pd.DataFrame(column_report, columns=columns).round(2)
    column_report.sort_values(by="NaNs", inplace=True)
    return column_report
def get_uniques(df):
    """Print the unique values found in each column

    For every column of ``df``, one line is written to standard output
    containing the column label followed by the result of
    ``Series.unique()`` for that column. Nothing is returned.
    """
    for name in df.columns:
        distinct_values = df[name].unique()
        print(name, distinct_values)
def get_addresses(df):
    """Return the distinct, normalised addresses found in a DataFrame

    Each address is built by joining the ``Street``, ``City`` and ``State``
    columns with single spaces, lower-casing the result and collapsing any
    run of whitespace into one space. Duplicates (after normalisation) are
    removed.

    Rows in which any of the three components is missing are skipped, since
    the concatenated value is then missing as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns ``Street``, ``City`` and ``State``.

    Returns
    -------
    list of str
        The unique normalised addresses, in sorted order.
    """
    combined = df["Street"] + " " + df["City"] + " " + df["State"]
    # Iterate the values directly: Series.iteritems() no longer exists in
    # pandas 2.x, and the index was never used. dropna() removes rows whose
    # concatenation is NaN, which would otherwise fail on ``.lower()``.
    addresses = {" ".join(item.lower().split()) for item in combined.dropna()}
    # Sorting makes the output reproducible instead of following set order.
    return sorted(addresses)
def output_addresses(df, file_out):
    """Write the addresses of a DataFrame to a text file, one per line

    The addresses are obtained from ``get_addresses(df)``. ``file_out`` is
    opened for writing (replacing any existing content) and each address is
    written followed by a newline.

    Parameters
    ----------
    df : pandas.DataFrame
        Passed unchanged to ``get_addresses``.
    file_out : str
        Path of the file to write.

    Returns
    -------
    list
        The same list of addresses that was written to the file.
    """
    unique_addresses = get_addresses(df)
    with open(file_out, 'w') as handle:
        handle.writelines(entry + '\n' for entry in unique_addresses)
    return unique_addresses