Shard large geocode inputs.
author Scott Gasch <[email protected]>
Sun, 1 May 2022 00:34:34 +0000 (17:34 -0700)
committer Scott Gasch <[email protected]>
Sun, 1 May 2022 00:34:34 +0000 (17:34 -0700)
geocode.py

index c2f5ead9bfa88576374d9abb5167eaf8b349683e..176487ae566458093ecf0fd95a3d21ab98221418 100644 (file)
@@ -13,6 +13,8 @@ from typing import Any, Dict, List, Optional
 import requests
 from requests.utils import requote_uri
 
+import list_utils
+
 logger = logging.getLogger(__name__)
 
 
@@ -54,6 +56,9 @@ def batch_geocode_addresses(addresses: List[str]):
     Result is an array of the same size as the input array with one
     answer record per line.  Returns None on error.
 
+    Inputs of more than 10k addresses are split into chunks internally
+    because the census website rejects requests of more than 10k lines.
+
     >>> batch_geocode_addresses(
     ...     [
     ...         '4600 Silver Hill Rd, Washington, DC, 20233',
@@ -65,31 +70,30 @@ def batch_geocode_addresses(addresses: List[str]):
     ['"1"," 4600 Silver Hill Rd,  Washington,  DC,  20233","Match","Exact","4600 SILVER HILL RD, WASHINGTON, DC, 20233","-76.92743,38.84599","76355984","L","24","033","802405","2004"', '"2"," 935 Pennsylvania Avenue,  NW,  Washington,  DC","No_Match"', '"3"," 1600 Pennsylvania Avenue NW,  Washington,  DC,  20500","Match","Exact","1600 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20500","-77.03534,38.898754","76225813","L","11","001","980000","1034"', '"4"," 700 Pennsylvania Avenue NW,  Washington,  DC,  20408","Match","Exact","700 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20408","-77.02304,38.89362","76226346","L","11","001","980000","1025"']
     """
 
-    # TODO: use list_utils.shard to break up the input if it's >10k records
-    # b/c the census website has a hard limit at 10k.
-
+    n = 1
     url = 'https://geocoding.geo.census.gov/geocoder/geographies/addressbatch'
     payload = {'benchmark': '4', 'vintage': '4'}
-    raw_file = ''
-    for n, address in enumerate(addresses):
-        raw_file += f'{n+1}, {address}\n'
-    files = {'addressFile': ('input.csv', raw_file)}
-    logger.debug('POST: %s', url)
-    try:
-        r = requests.post(url, files=files, data=payload)
-    except Exception as e:
-        logger.exception(e)
-        return None
-    if r.status_code != 200:
-        print(r.text)
-        logger.error('Unexpected response code %d, wanted 200.  Fail.', r.status_code)
-        return None
-
     out = []
-    for line in r.text.split('\n'):
-        line = line.strip()
-        if len(line) > 0:
-            out.append(line)
+    for chunk in list_utils.shard(addresses, 9999):
+        raw_file = ''
+        for address in chunk:
+            raw_file += f'{n}, {address}\n'
+            n += 1
+        files = {'addressFile': ('input.csv', raw_file)}
+        logger.debug('POST: %s', url)
+        try:
+            r = requests.post(url, files=files, data=payload)
+        except Exception as e:
+            logger.exception(e)
+            return None
+        if r.status_code != 200:
+            print(r.text)
+            logger.error('Unexpected response code %d, wanted 200.  Fail.', r.status_code)
+            return None
+        for line in r.text.split('\n'):
+            line = line.strip()
+            if len(line) > 0:
+                out.append(line)
     return out