From 06ade55539e2e31fbd8bc708ab6f91953d9663f3 Mon Sep 17 00:00:00 2001 From: Scott Gasch Date: Sat, 30 Apr 2022 17:34:34 -0700 Subject: [PATCH] Shard large geocode inputs. --- geocode.py | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/geocode.py b/geocode.py index c2f5ead..176487a 100644 --- a/geocode.py +++ b/geocode.py @@ -13,6 +13,8 @@ from typing import Any, Dict, List, Optional import requests from requests.utils import requote_uri +import list_utils + logger = logging.getLogger(__name__) @@ -54,6 +56,9 @@ def batch_geocode_addresses(addresses: List[str]): Result is an array of the same size as the input array with one answer record per line. Returns None on error. + This code will deal with requests >10k addresses by chunking them + internally because the census website disallows requests > 10k lines. + >>> batch_geocode_addresses( ... [ ... '4600 Silver Hill Rd, Washington, DC, 20233', @@ -65,31 +70,30 @@ def batch_geocode_addresses(addresses: List[str]): ['"1"," 4600 Silver Hill Rd, Washington, DC, 20233","Match","Exact","4600 SILVER HILL RD, WASHINGTON, DC, 20233","-76.92743,38.84599","76355984","L","24","033","802405","2004"', '"2"," 935 Pennsylvania Avenue, NW, Washington, DC","No_Match"', '"3"," 1600 Pennsylvania Avenue NW, Washington, DC, 20500","Match","Exact","1600 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20500","-77.03534,38.898754","76225813","L","11","001","980000","1034"', '"4"," 700 Pennsylvania Avenue NW, Washington, DC, 20408","Match","Exact","700 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20408","-77.02304,38.89362","76226346","L","11","001","980000","1025"'] """ - # TODO: use list_utils.shard to break up the input if it's >10k records - # b/c the census website has a hard limit at 10k. - + n = 1 url = 'https://geocoding.geo.census.gov/geocoder/geographies/addressbatch' payload = {'benchmark': '4', 'vintage': '4'} - raw_file = '' - for n, address in enumerate(addresses): - raw_file += f'{n+1}, {address}\n' - files = {'addressFile': ('input.csv', raw_file)} - logger.debug('POST: %s', url) - try: - r = requests.post(url, files=files, data=payload) - except Exception as e: - logger.exception(e) - return None - if r.status_code != 200: - print(r.text) - logger.error('Unexpected response code %d, wanted 200. Fail.', r.status_code) - return None - out = [] - for line in r.text.split('\n'): - line = line.strip() - if len(line) > 0: - out.append(line) + for chunk in list_utils.shard(addresses, 9999): + raw_file = '' + for address in chunk: + raw_file += f'{n}, {address}\n' + n += 1 + files = {'addressFile': ('input.csv', raw_file)} + logger.debug('POST: %s', url) + try: + r = requests.post(url, files=files, data=payload) + except Exception as e: + logger.exception(e) + return None + if r.status_code != 200: + print(r.text) + logger.error('Unexpected response code %d, wanted 200. Fail.', r.status_code) + return None + for line in r.text.split('\n'): + line = line.strip() + if len(line) > 0: + out.append(line) return out -- 2.47.1