216 lines
10 KiB
Python
216 lines
10 KiB
Python
|
#!/usr/bin/env python3
|
||
|
import requests
|
||
|
import io
|
||
|
from pypdf import PdfReader
|
||
|
import re
|
||
|
import logging
|
||
|
|
||
|
# Layout of every log line: timestamp, padded level, logger name, thread name, source location, message.
LOG_FORMAT = "%(asctime)s [%(levelname)-7s] %(name)s {%(threadName)s} %(filename)s:%(lineno)d: %(message)s"

# Configure the root logger once (DEBUG and above) and create this module's own logger.
logging.basicConfig(format=LOG_FORMAT, level=logging.DEBUG)
logger = logging.getLogger(__name__)
|
||
|
|
||
|
class Quicksy_Country:
    """One country entry of the ITU list, rendered as an Objective-C array element.

    Attributes:
        alpha2mapping: dict mapping wikidata country names to ISO 3166-1 alpha-2 codes.
        name: country name as written in the ITU document.
        alpha2: stored as given; ``__repr__`` does not read it (it looks the code up
            through ``alpha2mapping`` instead).
        code: dialling code string that is emitted verbatim.
        pattern: regular expression string that is emitted verbatim.
    """

    # map ITU country names to wikidata names
    # (class-level so the table is built once instead of on every repr() call)
    _ITU2WIKIDATA = {
        "Ireland": "Republic of Ireland",
        "China": "People's Republic of China",
        "Taiwan, China": "Taiwan",
        "Hong Kong, China": "Hong Kong",
        "Gambia": "The Gambia",
        "Falkland Islands (Malvinas)": "Falkland Islands",
        "Dominican Rep.": "Dominican Republic",
        "Dem. Rep. of the Congo": "Democratic Republic of the Congo",
        "Congo": "Republic of the Congo",
        "Czech Rep.": "Czech Republic",
        "Dem. People's Rep. of Korea": "North Korea",
        "Central African Rep.": "Central African Republic",
        "Bolivia (Plurinational State of)": "Bolivia",
        "Bahamas": "The Bahamas",
        "Korea (Rep. of)": "South Korea",
        "Iran (Islamic Republic of)": "Iran",
        "Lao P.D.R.": "Laos",
        "Moldova (Republic of)": "Moldova",
        "Micronesia": "Federated States of Micronesia",
        "Netherlands": "Kingdom of the Netherlands",
        "Russian Federation": "Russia",
        "Syrian Arab Republic": "Syria",
        "The Former Yugoslav Republic of Macedonia": "North Macedonia",
        "United States": "United States of America",
        "Vatican": "Vatican City",
        "Venezuela (Bolivarian Republic of)": "Venezuela",
        "Viet Nam": "Vietnam",
        "Swaziland": "Eswatini",
        "Sint Maarten (Dutch part)": "Sint Maarten",
        "Brunei Darussalam": "Brunei",
        "Bonaire, Sint Eustatius and Saba": "Caribbean Netherlands",
        "Côte d'Ivoire": "Ivory Coast",
        "Sao Tome and Principe": "São Tomé and Príncipe",
        "Timor-Leste": "East Timor",
        "Northern Marianas": "Northern Mariana Islands",
    }

    def __init__(self, alpha2mapping, name, alpha2, code, pattern):
        """Store all constructor arguments unchanged on the instance."""
        self.alpha2mapping = alpha2mapping
        self.name = name
        self.alpha2 = alpha2
        self.code = code
        self.pattern = pattern

    def __repr__(self):
        """Return the Objective-C ``[[Quicksy_Country alloc] init...],`` line for this country."""
        # translate the ITU spelling to the wikidata spelling where the two differ
        country = self._ITU2WIKIDATA.get(self.name, self.name)

        # Use the mapping handed to the constructor rather than a module-level global
        # of the same name, so the instance works wherever it is created.
        if country in self.alpha2mapping:
            # known country: emit Objective-C code with the alpha-2 code instead of a localizable name
            alpha2 = self.alpha2mapping[country]
            return f"[[Quicksy_Country alloc] initWithName:nil alpha2:@\"{alpha2}\" code:@\"{self.code}\" pattern:@\"{self.pattern}\"],"
        # emit Objective-C code with a localizable name for every country we don't know the alpha-2 code for
        return f"[[Quicksy_Country alloc] initWithName:NSLocalizedString(@\"{self.name}\", @\"quicksy country\") alpha2:nil code:@\"{self.code}\" pattern:@\"{self.pattern}\"],"
|
||
|
|
||
|
def parse_pdf(pdf_data, alpha2mapping):
    """Extract all country rows from the ITU PDF and return them as Quicksy_Country objects.

    Args:
        pdf_data: raw bytes of the PDF document (wrapped in io.BytesIO for PdfReader).
        alpha2mapping: passed through unchanged to every created Quicksy_Country.

    Returns:
        A flat list of Quicksy_Country objects in page order, then row order.
    """
    logger.info("Parsing PDF...")
    # table columns are separated by exactly 32 spaces in the (normalized) layout text
    country_regex = re.compile(r'^(?P<country>[^0-9]+)[ ]{32}(?P<code>[0-9]+)[ ]{32}(?P<international_prefix>.+)[ ]{32}(?P<national_prefix>.+)[ ]{32}(?P<format>.+ digits)[ ]{32}(?P<end>.*)$')
    wide_gap_regex = re.compile("[ ]{128,}")
    separator = " "*32
    # a page is only treated as a table page if all of these header words occur on it
    header_markers = ("Country/geographical area", "Country", "International", "National", "National (Significant)", "UTC/DST", "Note")
    countries = {}
    pdf = PdfReader(io.BytesIO(pdf_data))
    for pagenum, page in enumerate(pdf.pages, start=1):
        countries[pagenum] = []
        # last_entry is an index into countries[pagenum]; a value left over from the
        # previous page would point at the wrong (or a non-existing) entry of this page
        last_entry = None
        logger.info(f"Starting to analyze page {pagenum}...")
        text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False)
        if text and all(marker in text for marker in header_markers):
            for line in text.split("\n"):
                # collapsing very wide gaps first is faster than having a "{128,}" in the compiled country_regex
                normalized = wide_gap_regex.sub(separator, line)
                found = country_regex.match(normalized)
                if found is None:
                    # check if this is just a linebreak in the country name and append the value to the previous country
                    if normalized == line.strip() and last_entry is not None and "Annex to ITU" not in line:
                        logger.debug(f"Adding to last country name: {line=}")
                        countries[pagenum][last_entry].name += f" {line.strip()}"
                    else:
                        last_entry = None       # don't append line continuations of non-real countries to a real country
                    continue

                # split the trailing column(s) into "dst" and the optional "notes"
                fields = found.groupdict() | {"dst": None, "notes": None}
                if fields["end"] and fields["end"].strip() != "":
                    end_splitting = fields["end"].split(separator)
                    fields["dst"] = end_splitting[0]
                    if len(end_splitting) >= 2:
                        fields["notes"] = end_splitting[1]
                fields = {key: (value.strip() if value is not None else None) for key, value in fields.items()}

                if fields["dst"] is None:       # all real countries have a dst entry
                    last_entry = None           # don't append line continuations of non-real countries to a real country
                    continue

                country_code = f"+{fields['code']}"
                pattern = subpattern_matchers(fields["format"], True)
                if pattern is None:
                    # fallback for "(<digits and slashes>) + <length description> digits":
                    # the parenthesised alternatives become a regex group in front of the length part
                    superpattern = matcher(None, r"(\([0-9/]+\))[ ]*\+[ ]*(.+)[ ]+digits", fields["format"], lambda result: result)
                    if superpattern is not None:
                        subpattern = subpattern_matchers(superpattern.group(2), False)
                        if subpattern is not None:
                            pattern = re.sub("/", "|", superpattern.group(1)) + subpattern
                if pattern is None:
                    logger.warning(f"Unknown format description for {fields['country']} ({country_code}): '{fields['format']}'")
                    pattern = "[0-9]+"          # accept any number of digits if the description is not understood
                country = Quicksy_Country(alpha2mapping, fields["country"], None, country_code, f"^{pattern}$")
                countries[pagenum].append(country)
                last_entry = len(countries[pagenum]) - 1
        logger.info(f"Page {pagenum}: Found {len(countries[pagenum])} countries so far...")

    all_countries = [c for cs in countries.values() for c in cs]
    logger.info(f"Parsing finished: Extracted {len(all_countries)} countries...")
    return all_countries
|
||
|
|
||
|
def matcher(previous_result, regex, text, closure):
    """Apply one step of a "first successful attempt wins" matching chain.

    Args:
        previous_result: outcome of an earlier attempt; anything other than None
            is returned unchanged and no matching is done.
        regex: pattern handed to re.match, so it is only anchored at the start of text.
        text: string to match against.
        closure: callable that receives the match object and produces the result.

    Returns:
        previous_result if it is not None, otherwise closure(match) on a match,
        otherwise None.
    """
    if previous_result is not None:
        return previous_result
    found = re.match(regex, text)
    return None if found is None else closure(found)
|
||
|
|
||
|
def subpattern_matchers(text, should_end_with_unit):
    """Turn a comma separated length description into a regex group of alternatives.

    Every comma separated part is matched against "<n>", "<n> to <m>" or "up to <m>"
    (a leading "up" is treated as a lower bound of 1) and becomes "[0-9]{n}" or
    "[0-9]{n,m}". Parts that do not match are skipped.

    Args:
        text: the description to convert.
        should_end_with_unit: if true, text has to end in "digits"; that suffix is
            removed before parsing and an error is logged if it is missing.

    Returns:
        "(" + alternatives joined by "|" + ")", or None if the required unit is
        missing or no part could be converted.
    """
    if should_end_with_unit:
        if text[-6:] != "digits":
            logger.error(f"should_end_with_unit set but not ending in 'digits': {text[-6:] = }")
            return None
        text = text[:-6]

    def to_quantifier(found):
        # group 1 is the lower bound (or the word "up"), group 3 the optional upper bound
        lower = "1" if found.group(1) == "up" else found.group(1)
        upper = found.group(3)
        bounds = lower if upper is None else f"{lower},{upper}"
        return "[0-9]{" + bounds + "}"

    candidates = (
        matcher(None, r"(up|[0-9]+)([ ]*to[ ]*([0-9]+)[ ]*)?", piece.strip(), to_quantifier)
        for piece in text.split(",")
    )
    alternatives = [candidate for candidate in candidates if candidate is not None]
    if not alternatives:
        return None
    return "(" + "|".join(alternatives) + ")"
|
||
|
|
||
|
def get_sparql_results(query):
    """Send query to the Wikidata SPARQL endpoint and return the converted JSON response.

    Args:
        query: SPARQL query text.

    Returns:
        Whatever SPARQLWrapper's convert() yields for the JSON return format.
    """
    # both imports are only needed by this function
    import sys
    from SPARQLWrapper import JSON, SPARQLWrapper

    # the user agent carries the major.minor version of the running interpreter
    python_version = sys.version_info[:2]
    endpoint = SPARQLWrapper(
        "https://query.wikidata.org/sparql",
        agent="monal-im itu pdf parser/%s.%s" % python_version,
    )
    endpoint.setReturnFormat(JSON)
    endpoint.setQuery(query)
    response = endpoint.query()
    return response.convert()
|
||
|
|
||
|
|
||
|
# Build the lookup table "wikidata country label" -> "ISO 3166-1 alpha-2 code".
logger.info("Downloading Wikidata country names to ISO 3166-1 alpha-2 codes mapping...")
results = get_sparql_results("""SELECT ?country ?countryLabel ?code WHERE {
?country wdt:P297 ?code .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}""")
alpha2mapping = {result["countryLabel"]["value"]: result["code"]["value"] for result in results["results"]["bindings"]}

logger.info("Downloading PDF...")
# requests waits forever by default and does not raise on HTTP error codes by itself:
# bound the wait and fail early instead of handing an error page to the PDF parser
response = requests.get("https://www.itu.int/dms_pub/itu-t/opb/sp/T-SP-E.164C-2011-PDF-E.pdf", timeout=60)
response.raise_for_status()
countries = parse_pdf(response.content, alpha2mapping)
|
||
|
|
||
|
# Everything of the generated Objective-C file that precedes the array elements.
OBJC_HEADER = """// This file was automatically generated by scripts/itu_pdf_to_objc.py
// Please run this python script again to update this file
// Example ../scripts/itu_pdf_to_objc.py >Classes/HelperTools+Quicksy_CountryCodes.m

#import "Quicksy_Country.h"
#import "HelperTools.h"

NSArray* _Nonnull COUNTRY_CODES = @[]; //will be replaced by actual values in +load below

@implementation HelperTools (CountryCodes)

//see https://stackoverflow.com/a/13326633 and https://fek.io/blog/method-swizzling-in-obj-c-and-swift/
+(void) load
{
if(self == HelperTools.self)
{
static dispatch_once_t onceToken;
dispatch_once(&onceToken, ^{
COUNTRY_CODES = @["""

# Closes the array literal, the dispatch_once block, +load and the category.
OBJC_FOOTER = """ ];
});
}
}

@end"""

# write the complete Objective-C source to stdout: header, one array element per country, footer
print(OBJC_HEADER)
for country in countries:
    print(f" {country}")
print(OBJC_FOOTER)
|