X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=geocode.py;h=e9e5c35c5fbec6a272518351c461ec5a4fed4243;hb=e46158e49121b8a955bb07b73f5bcf9928b79c90;hp=c2f5ead9bfa88576374d9abb5167eaf8b349683e;hpb=606e0d6372392183af23ab162755d8714361a417;p=python_utils.git diff --git a/geocode.py b/geocode.py index c2f5ead..e9e5c35 100644 --- a/geocode.py +++ b/geocode.py @@ -3,31 +3,56 @@ # © Copyright 2022, Scott Gasch """Wrapper around US Census address geocoder API described here: -https://www2.census.gov/geo/pdfs/maps-data/data/Census_Geocoder_User_Guide.pdf -https://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.pdf + +* https://www2.census.gov/geo/pdfs/maps-data/data/Census_Geocoder_User_Guide.pdf +* https://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.pdf + +Also try:: + + $ curl --form addressFile=@localfile.csv \ + --form benchmark=2020 \ + https://geocoding.geo.census.gov/geocoder/locations/addressbatch \ + --output geocoderesult.csv """ +import json import logging from typing import Any, Dict, List, Optional import requests from requests.utils import requote_uri +import list_utils + logger = logging.getLogger(__name__) def geocode_address(address: str) -> Optional[Dict[str, Any]]: - """Send a single address to the US Census geocoding API. The response - is a parsed JSON chunk of data with N addressMatches in the result - section and the details of each match within it. Returns None on error. + """Send a single address to the US Census geocoding API in order to + lookup relevant data about it (including, if possible, its + lat/long). The response is a parsed JSON chunk of data with N + addressMatches in the result section and the details of each match + within it. + + Args: + address: the full address to lookup in the form: "STREET + ADDRESS, CITY, STATE, ZIPCODE". These components may be + omitted and the service will make educated guesses but + the commas delimiting each component must be included. + + Returns: + A parsed json dict with a bunch of information about the + address contained within it. Each 'addressMatch' + in the JSON describes the details of a possible match. + Returns None if there was an error or the address is + not known. >>> json = geocode_address('4600 Silver Hill Rd,, 20233') >>> json['result']['addressMatches'][0]['matchedAddress'] '4600 SILVER HILL RD, WASHINGTON, DC, 20233' >>> json['result']['addressMatches'][0]['coordinates'] - {'x': -76.92743, 'y': 38.84599} - + {'x': -76.9274328556918, 'y': 38.845989080537514} """ url = 'https://geocoding.geo.census.gov/geocoder/geographies/onelineaddress' url += f'?address={address}' @@ -41,18 +66,32 @@ def geocode_address(address: str) -> Optional[Dict[str, Any]]: return None if r.status_code != 200: + logger.debug(r.text) logger.error('Unexpected response code %d, wanted 200. Fail.', r.status_code) return None - # print(json.dumps(r.json(), indent=4, sort_keys=True)) + logger.debug('Response: %s', json.dumps(r.json(), indent=4, sort_keys=True)) return r.json() -def batch_geocode_addresses(addresses: List[str]): - """Send up to addresses for batch geocoding. Each line of the input - list should be a single address of the form: STREET ADDRESS, CITY, - STATE, ZIP. Components may be omitted but the commas may not be. - Result is an array of the same size as the input array with one - answer record per line. Returns None on error. +def batch_geocode_addresses(addresses: List[str]) -> Optional[List[str]]: + """Send a list of addresses for batch geocoding to a web service + operated by the US Census Bureau. + + Args: + addresses: a list of addresses to geocode. Each line of the + input list should be a single address in the form: "STREET + ADDRESS, CITY, STATE, ZIPCODE". Individual address components + may be omitted and the service will make educated guesses but + the commas delimiters between address components may not be + omitted. + + Returns: + An array of the same size as the input array with one + answer record per line. Returns None on error. + + Note: this code will deal with requests >10k addresses by chunking + them internally because the census website disallows requests > + 10k lines. >>> batch_geocode_addresses( ... [ @@ -62,34 +101,35 @@ def batch_geocode_addresses(addresses: List[str]): ... '700 Pennsylvania Avenue NW, Washington, DC, 20408', ... ] ... ) - ['"1"," 4600 Silver Hill Rd, Washington, DC, 20233","Match","Exact","4600 SILVER HILL RD, WASHINGTON, DC, 20233","-76.92743,38.84599","76355984","L","24","033","802405","2004"', '"2"," 935 Pennsylvania Avenue, NW, Washington, DC","No_Match"', '"3"," 1600 Pennsylvania Avenue NW, Washington, DC, 20500","Match","Exact","1600 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20500","-77.03534,38.898754","76225813","L","11","001","980000","1034"', '"4"," 700 Pennsylvania Avenue NW, Washington, DC, 20408","Match","Exact","700 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20408","-77.02304,38.89362","76226346","L","11","001","980000","1025"'] + ['"1"," 4600 Silver Hill Rd, Washington, DC, 20233","Match","Exact","4600 SILVER HILL RD, WASHINGTON, DC, 20233","-76.92743285599994,38.84598908100003","76355984","L","24","033","802405","2004"', '"2"," 935 Pennsylvania Avenue, NW, Washington, DC","No_Match"', '"3"," 1600 Pennsylvania Avenue NW, Washington, DC, 20500","Match","Exact","1600 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20500","-77.03534009999998,38.89875363300007","76225813","L","11","001","980000","1034"', '"4"," 700 Pennsylvania Avenue NW, Washington, DC, 20408","Match","Exact","700 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20408","-77.02304089899997,38.89361872300003","76226346","L","11","001","980000","1025"'] """ - # TODO: use list_utils.shard to break up the input if it's >10k records - # b/c the census website has a hard limit at 10k. - + n = 1 url = 'https://geocoding.geo.census.gov/geocoder/geographies/addressbatch' payload = {'benchmark': '4', 'vintage': '4'} - raw_file = '' - for n, address in enumerate(addresses): - raw_file += f'{n+1}, {address}\n' - files = {'addressFile': ('input.csv', raw_file)} - logger.debug('POST: %s', url) - try: - r = requests.post(url, files=files, data=payload) - except Exception as e: - logger.exception(e) - return None - if r.status_code != 200: - print(r.text) - logger.error('Unexpected response code %d, wanted 200. Fail.', r.status_code) - return None - out = [] - for line in r.text.split('\n'): - line = line.strip() - if len(line) > 0: - out.append(line) + for chunk in list_utils.shard(addresses, 9999): + raw_file = '' + for address in chunk: + raw_file += f'{n}, {address}\n' + n += 1 + files = {'addressFile': ('input.csv', raw_file)} + logger.debug('POST: %s', url) + try: + r = requests.post(url, files=files, data=payload) + except Exception as e: + logger.exception(e) + return None + + if r.status_code != 200: + logger.debug(r.text) + logger.error('Unexpected response code %d, wanted 200. Fail.', r.status_code) + return None + logger.debug('Response: %s', r.text) + for line in r.text.split('\n'): + line = line.strip() + if len(line) > 0: + out.append(line) return out