From 606e0d6372392183af23ab162755d8714361a417 Mon Sep 17 00:00:00 2001 From: Scott Gasch Date: Sat, 30 Apr 2022 17:24:53 -0700 Subject: [PATCH] Batch geocoding. Also use format=json for the single address request to remove the need for beautifulsoup. --- geocode.py | 95 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 35 deletions(-) diff --git a/geocode.py b/geocode.py index 428eb58..c2f5ead 100644 --- a/geocode.py +++ b/geocode.py @@ -3,36 +3,35 @@ # © Copyright 2022, Scott Gasch """Wrapper around US Census address geocoder API described here: -https://www2.census.gov/geo/pdfs/maps-data/data/Census_Geocoder_User_Guide.pdf""" +https://www2.census.gov/geo/pdfs/maps-data/data/Census_Geocoder_User_Guide.pdf +https://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.pdf +""" import logging -import re -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional import requests -from bs4 import BeautifulSoup from requests.utils import requote_uri -import string_utils - logger = logging.getLogger(__name__) def geocode_address(address: str) -> Optional[Dict[str, Any]]: - """Send a single address to the US Census geocoding API. + """Send a single address to the US Census geocoding API. The response + is a parsed JSON chunk of data with N addressMatches in the result + section and the details of each match within it. Returns None on error. 
- >>> out = geocode_address('4600 Silver Hill Rd,, 20233') - >>> out['Matched Address'] + >>> json = geocode_address('4600 Silver Hill Rd,, 20233') + >>> json['result']['addressMatches'][0]['matchedAddress'] '4600 SILVER HILL RD, WASHINGTON, DC, 20233' - >>> out['Interpolated Longitude (X) Coordinates'] - -76.92743 - >>> out['Interpolated Latitude (Y) Coordinates'] - 38.84599 + + >>> json['result']['addressMatches'][0]['coordinates'] + {'x': -76.92743, 'y': 38.84599} """ url = 'https://geocoding.geo.census.gov/geocoder/geographies/onelineaddress' url += f'?address={address}' - url += '&layers=all&benchmark=4&vintage=4' + url += '&returntype=geographies&layers=all&benchmark=4&vintage=4&format=json' url = requote_uri(url) logger.debug('GET: %s', url) try: @@ -44,27 +43,53 @@ def geocode_address(address: str) -> Optional[Dict[str, Any]]: if r.status_code != 200: logger.error('Unexpected response code %d, wanted 200. Fail.', r.status_code) return None - else: - soup = BeautifulSoup(r.text, 'html.parser') - result = soup.find('div', id='pl_gov_census_geo_geocoder_domain_AddressResult') - logger.debug('Unhelpful result blurb: "%s"', result) - output = result.get_text('\n') - label = None - out = {} - for line in output.split('\n'): - if re.match(r'.*: *$', line): - line = line.strip() - label = line[:-1] - logger.debug('Label is: "%s"', label) - else: - if label: - value = line.strip() - if string_utils.is_integer_number(value): - value = int(value) - elif string_utils.is_number(value): - value = float(value) - logger.debug('Value is: "%s"', value) - out[label] = value + # print(json.dumps(r.json(), indent=4, sort_keys=True)) + return r.json() + + +def batch_geocode_addresses(addresses: List[str]): + """Send up to 10,000 addresses for batch geocoding. Each line of the input + list should be a single address of the form: STREET ADDRESS, CITY, + STATE, ZIP. Components may be omitted but the commas may not be. 
+ Result is an array of the same size as the input array with one + answer record per line. Returns None on error. + + >>> batch_geocode_addresses( + ... [ + ... '4600 Silver Hill Rd, Washington, DC, 20233', + ... '935 Pennsylvania Avenue, NW, Washington, DC, 20535-0001', + ... '1600 Pennsylvania Avenue NW, Washington, DC, 20500', + ... '700 Pennsylvania Avenue NW, Washington, DC, 20408', + ... ] + ... ) + ['"1"," 4600 Silver Hill Rd, Washington, DC, 20233","Match","Exact","4600 SILVER HILL RD, WASHINGTON, DC, 20233","-76.92743,38.84599","76355984","L","24","033","802405","2004"', '"2"," 935 Pennsylvania Avenue, NW, Washington, DC","No_Match"', '"3"," 1600 Pennsylvania Avenue NW, Washington, DC, 20500","Match","Exact","1600 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20500","-77.03534,38.898754","76225813","L","11","001","980000","1034"', '"4"," 700 Pennsylvania Avenue NW, Washington, DC, 20408","Match","Exact","700 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20408","-77.02304,38.89362","76226346","L","11","001","980000","1025"'] + """ + + # TODO: use list_utils.shard to break up the input if it's >10k records + # b/c the census website has a hard limit at 10k. + + url = 'https://geocoding.geo.census.gov/geocoder/geographies/addressbatch' + payload = {'benchmark': '4', 'vintage': '4'} + raw_file = '' + for n, address in enumerate(addresses): + raw_file += f'{n+1}, {address}\n' + files = {'addressFile': ('input.csv', raw_file)} + logger.debug('POST: %s', url) + try: + r = requests.post(url, files=files, data=payload) + except Exception as e: + logger.exception(e) + return None + if r.status_code != 200: + print(r.text) + logger.error('Unexpected response code %d, wanted 200. Fail.', r.status_code) + return None + + out = [] + for line in r.text.split('\n'): + line = line.strip() + if len(line) > 0: + out.append(line) return out -- 2.46.0