Source code for lovelyrita.addresses

from six import string_types
import re
import pandas as pd

# use sparingly, these take a lot of time to evaluate
REPLACEMENTS = [(r'^ONE ', '1 '),
                (r'^TWO ', '2 '),
                (' -', '-'),
                (r' TERM$', ' TERMINAL'),
                (r'^#', '')]


VALID_SUFFIXES = ['AVE', 'AVEN', 'BLOCK', 'BLVD', 'BOULEVARD', 'CIR', 'COURT',
                  'CREST', 'CREEK', 'CRK', 'DR', 'DRI', 'DRIVE', 'HBR', 'HTS',
                  'LANE', 'LOOP', 'PARKWAY', 'PKWAY', 'PKWY', 'PL', 'PLACE',
                  'PLAZA', 'PLZ', 'R', 'ROAD', 'SR', 'ST', 'STREET',
                  'TERR', 'TERRACE', 'VISTA', 'VW', 'WAY', 'WY']


[docs]def replace(addresses, replacements=REPLACEMENTS, inplace=True): """Replace text in addresses Parameters ---------- addresses : pandas.Series replacements : tuple list of tuples Replacements provided as (pattern to replace, replacement). Multiple replacements can be provided as a list of tuples inplace : bool Returns ------- If inplace is False, returns the address series with the replacements made. """ if isinstance(replacements[0], string_types): replacements = [replacements, ] if inplace: addr = addresses else: addr = addresses.copy() for pattern, replacement in REPLACEMENTS: addr.replace(pattern, replacement, regex=True, inplace=True) if not inplace:
return addr
[docs]def parse_123_main_street(addresses): """Parse the common address format, e.g. 123 MAIN STREET Parameter --------- addresses : pandas.Series Returns ------- A DataFrame containing street name and street column for those rows that were successfully parsed """ patt = re.compile(r'^(?P<street_no>\d+\-?\d?) (?P<street_name>[\w\d\s]+)') street = addresses.str.extract(patt, expand=True) street.dropna(inplace=True)
return street
[docs]def parse_P123_main_street(addresses): """Parse addresses that contain a prefix, e.g., P123-1 PARK STREET Parameter --------- addresses : pandas.Series Returns ------- A DataFrame containing street name and street column for those rows that were successfully parsed """ patt = re.compile(r'^(?P<prefix>[A-Z]+)\-?(?P<street_number>\d+[\-\W]?\d?) ' '(?P<street_name>[\w\d\s]+)') street = addresses.str.extract(patt, expand=True) street.dropna(inplace=True) drop_indices = [] for i, s in street.iterrows(): street_words = s.street_name.split(' ') drop = True for street_word in street_words: if street_word.startswith(s.prefix[0]): drop = False if drop: drop_indices.append(i) street.drop(drop_indices, inplace=True)
return street
[docs]def parse_addresses(addresses): """Parse addresses into street name and number according to several rules. Parameter --------- addresses : pandas.Series Returns ------- A DataFrame containing street name and street column for those rows that were successfully parsed """ # Many addresses are in parking lots. Those will not have street numbers, so we should treat them separately. We will only concern ourselves with potential street addresses. lot_indices = addresses.str.contains('^[A-Z]LOT.*LOT$') street_addresses = addresses.loc[~lot_indices] street = pd.DataFrame({'street_name': None, 'street_number': None}, index=addresses.index) new_street = parse_123_main_street(street_addresses) street.update(new_street) new_street = parse_P123_main_street(street_addresses) street.update(new_street)
return street
[docs]def clean_street_suffixes(street): import numpy as np import difflib candidates = ['AVENUE', 'DRIVE', 'BLOCK', 'PLACE', 'STREET'] def get_suffix_match(s): if isinstance(s, np.float): return match = difflib.get_close_matches(s, candidates, n=1, cutoff=0.81) if len(match) == 1: return match[0] else: return street = street.street_suffix.apply(get_suffix_match)
return street[~pd.isnull(street)]