Batch geocoding. Also use format=json for the single address request to
authorScott Gasch <[email protected]>
Sun, 1 May 2022 00:24:53 +0000 (17:24 -0700)
committerScott Gasch <[email protected]>
Sun, 1 May 2022 00:24:53 +0000 (17:24 -0700)
remove the need for beautifulsoup.

geocode.py

index 428eb58ccb793f6cf4508aca16b5d4c202a45257..c2f5ead9bfa88576374d9abb5167eaf8b349683e 100644 (file)
@@ -3,36 +3,35 @@
 # © Copyright 2022, Scott Gasch
 
 """Wrapper around US Census address geocoder API described here:
-https://www2.census.gov/geo/pdfs/maps-data/data/Census_Geocoder_User_Guide.pdf"""
+https://www2.census.gov/geo/pdfs/maps-data/data/Census_Geocoder_User_Guide.pdf
+https://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.pdf
+"""
 
 import logging
-import re
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional
 
 import requests
-from bs4 import BeautifulSoup
 from requests.utils import requote_uri
 
-import string_utils
-
 logger = logging.getLogger(__name__)
 
 
 def geocode_address(address: str) -> Optional[Dict[str, Any]]:
-    """Send a single address to the US Census geocoding API.
+    """Send a single address to the US Census geocoding API.  The response
+    is a parsed JSON chunk of data with N addressMatches in the result
+    section and the details of each match within it.  Returns None on error.
 
-    >>> out = geocode_address('4600 Silver Hill Rd,, 20233')
-    >>> out['Matched Address']
+    >>> json = geocode_address('4600 Silver Hill Rd,, 20233')
+    >>> json['result']['addressMatches'][0]['matchedAddress']
     '4600 SILVER HILL RD, WASHINGTON, DC, 20233'
-    >>> out['Interpolated Longitude (X) Coordinates']
-    -76.92743
-    >>> out['Interpolated Latitude (Y) Coordinates']
-    38.84599
+
+    >>> json['result']['addressMatches'][0]['coordinates']
+    {'x': -76.92743, 'y': 38.84599}
 
     """
     url = 'https://geocoding.geo.census.gov/geocoder/geographies/onelineaddress'
     url += f'?address={address}'
-    url += '&layers=all&benchmark=4&vintage=4'
+    url += '&returntype=geographies&layers=all&benchmark=4&vintage=4&format=json'
     url = requote_uri(url)
     logger.debug('GET: %s', url)
     try:
@@ -44,27 +43,53 @@ def geocode_address(address: str) -> Optional[Dict[str, Any]]:
     if r.status_code != 200:
         logger.error('Unexpected response code %d, wanted 200.  Fail.', r.status_code)
         return None
-    else:
-        soup = BeautifulSoup(r.text, 'html.parser')
-        result = soup.find('div', id='pl_gov_census_geo_geocoder_domain_AddressResult')
-        logger.debug('Unhelpful result blurb: "%s"', result)
-        output = result.get_text('\n')
-        label = None
-        out = {}
-        for line in output.split('\n'):
-            if re.match(r'.*: *$', line):
-                line = line.strip()
-                label = line[:-1]
-                logger.debug('Label is: "%s"', label)
-            else:
-                if label:
-                    value = line.strip()
-                    if string_utils.is_integer_number(value):
-                        value = int(value)
-                    elif string_utils.is_number(value):
-                        value = float(value)
-                    logger.debug('Value is: "%s"', value)
-                    out[label] = value
+    # print(json.dumps(r.json(), indent=4, sort_keys=True))
+    return r.json()
+
+
+def batch_geocode_addresses(addresses: List[str]):
+    """Send up to 10,000 addresses for batch geocoding.  Each line of the input
+    list should be a single address of the form: STREET ADDRESS, CITY,
+    STATE, ZIP.  Components may be omitted but the commas may not be.
+    Result is an array of the same size as the input array with one
+    answer record per line.  Returns None on error.
+
+    >>> batch_geocode_addresses(
+    ...     [
+    ...         '4600 Silver Hill Rd, Washington, DC, 20233',
+    ...         '935 Pennsylvania Avenue, NW, Washington, DC, 20535-0001',
+    ...         '1600 Pennsylvania Avenue NW, Washington, DC, 20500',
+    ...         '700 Pennsylvania Avenue NW, Washington, DC, 20408',
+    ...     ]
+    ... )
+    ['"1"," 4600 Silver Hill Rd,  Washington,  DC,  20233","Match","Exact","4600 SILVER HILL RD, WASHINGTON, DC, 20233","-76.92743,38.84599","76355984","L","24","033","802405","2004"', '"2"," 935 Pennsylvania Avenue,  NW,  Washington,  DC","No_Match"', '"3"," 1600 Pennsylvania Avenue NW,  Washington,  DC,  20500","Match","Exact","1600 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20500","-77.03534,38.898754","76225813","L","11","001","980000","1034"', '"4"," 700 Pennsylvania Avenue NW,  Washington,  DC,  20408","Match","Exact","700 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20408","-77.02304,38.89362","76226346","L","11","001","980000","1025"']
+    """
+
+    # TODO: use list_utils.shard to break up the input if it's >10k records
+    # b/c the census website has a hard limit at 10k.
+
+    url = 'https://geocoding.geo.census.gov/geocoder/geographies/addressbatch'
+    payload = {'benchmark': '4', 'vintage': '4'}
+    raw_file = ''
+    for n, address in enumerate(addresses):
+        raw_file += f'{n+1}, {address}\n'
+    files = {'addressFile': ('input.csv', raw_file)}
+    logger.debug('POST: %s', url)
+    try:
+        r = requests.post(url, files=files, data=payload)
+    except Exception as e:
+        logger.exception(e)
+        return None
+    if r.status_code != 200:
+        print(r.text)
+        logger.error('Unexpected response code %d, wanted 200.  Fail.', r.status_code)
+        return None
+
+    out = []
+    for line in r.text.split('\n'):
+        line = line.strip()
+        if len(line) > 0:
+            out.append(line)
     return out