another.im-ios/scripts/itu_pdf_to_objc.py

#!/usr/bin/env python3
import requests
import io
from pypdf import PdfReader
import re
import logging

# Log everything from DEBUG upwards, including timestamp, level, logger name, thread and source location.
# NOTE(review): basicConfig is called without a stream/filename, so the default handler (stderr) is used;
# presumably this keeps log lines out of the generated code that is printed to stdout -- confirm.
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s [%(levelname)-7s] %(name)s {%(threadName)s} %(filename)s:%(lineno)d: %(message)s")
logger = logging.getLogger(__name__)

class Quicksy_Country:
    """One country entry whose ``repr()`` is a single Objective-C array element.

    Attributes are plain values set by the constructor:
      alpha2mapping -- mapping of wikidata country names to ISO 3166-1 alpha-2 codes
      name          -- country name as written in the ITU document
      alpha2        -- stored as given; not consulted when rendering
      code          -- dialing code string inserted into the output verbatim
      pattern       -- regular expression string inserted into the output verbatim
    """

    # map ITU country names to wikidata names
    _ITU2WIKIDATA = {
        "Ireland": "Republic of Ireland",
        "China": "People's Republic of China",
        "Taiwan, China": "Taiwan",
        "Hong Kong, China": "Hong Kong",
        "Gambia": "The Gambia",
        "Falkland Islands (Malvinas)": "Falkland Islands",
        "Dominican Rep.": "Dominican Republic",
        "Dem. Rep. of the Congo": "Democratic Republic of the Congo",
        "Congo": "Republic of the Congo",
        "Czech Rep.": "Czech Republic",
        "Dem. People's Rep. of Korea": "North Korea",
        "Central African Rep.": "Central African Republic",
        "Bolivia (Plurinational State of)": "Bolivia",
        "Bahamas": "The Bahamas",
        "Korea (Rep. of)": "South Korea",
        "Iran (Islamic Republic of)": "Iran",
        "Lao P.D.R.": "Laos",
        "Moldova (Republic of)": "Moldova",
        "Micronesia": "Federated States of Micronesia",
        "Netherlands": "Kingdom of the Netherlands",
        "Russian Federation": "Russia",
        "Syrian Arab Republic": "Syria",
        "The Former Yugoslav Republic of Macedonia": "North Macedonia",
        "United States": "United States of America",
        "Vatican": "Vatican City",
        "Venezuela (Bolivarian Republic of)": "Venezuela",
        "Viet Nam": "Vietnam",
        "Swaziland": "Eswatini",
        "Sint Maarten (Dutch part)": "Sint Maarten",
        "Brunei Darussalam": "Brunei",
        "Bonaire, Sint Eustatius and Saba": "Caribbean Netherlands",
        "Côte d'Ivoire": "Ivory Coast",
        "Sao Tome and Principe": "São Tomé and Príncipe",
        "Timor-Leste": "East Timor",
        "Northern Marianas": "Northern Mariana Islands",
    }

    def __init__(self, alpha2mapping, name, alpha2, code, pattern):
        self.alpha2mapping = alpha2mapping
        self.name = name
        self.alpha2 = alpha2
        self.code = code
        self.pattern = pattern

    def __repr__(self):
        """Return the Objective-C initializer expression (with trailing comma) for this country."""
        # translate the ITU spelling to the wikidata spelling where the two differ
        country = self._ITU2WIKIDATA.get(self.name, self.name)

        # Use the mapping handed to the constructor rather than a module-level global,
        # so the class works no matter where (or whether) such a global exists.
        if country in self.alpha2mapping:
            # known alpha-2 code: emit it instead of a localizable name
            return f'[[Quicksy_Country alloc] initWithName:nil alpha2:@"{self.alpha2mapping[country]}" code:@"{self.code}" pattern:@"{self.pattern}"],'
        # no alpha-2 code known: emit the ITU name as a localizable string
        return f'[[Quicksy_Country alloc] initWithName:NSLocalizedString(@"{self.name}", @"quicksy country") alpha2:nil code:@"{self.code}" pattern:@"{self.pattern}"],'

def parse_pdf(pdf_data, alpha2mapping):
    """Extract the country table rows from the given PDF.

    Every page is converted to text using pypdf's layout extraction mode. Pages whose
    text does not contain all expected column headings are skipped. Each remaining line
    is matched against the row layout (columns separated by 32 spaces); a non-matching
    line may be treated as the wrapped remainder of the previous country's name.

    Args:
        pdf_data: Bytes of the PDF document (wrapped in io.BytesIO for the reader).
        alpha2mapping: Mapping passed unchanged to every created Quicksy_Country.

    Returns:
        A flat list of Quicksy_Country objects, ordered by page and then by row.
    """
    logger.info("Parsing PDF...")
    column_gap = " " * 32
    country_regex = re.compile(r'^(?P<country>[^0-9]+)[ ]{32}(?P<code>[0-9]+)[ ]{32}(?P<international_prefix>.+)[ ]{32}(?P<national_prefix>.+)[ ]{32}(?P<format>.+ digits)[ ]{32}(?P<end>.*)$')
    # collapsing very wide gaps up front is faster than having a "{128,}" in the compiled country_regex
    wide_gap_regex = re.compile("[ ]{128,}")
    # column headings that must all be present for a page to be considered part of the table
    table_markers = (
        "Country/geographical area",
        "Country",
        "International",
        "National",
        "National (Significant)",
        "UTC/DST",
        "Note",
    )
    countries = {}
    pdf = PdfReader(io.BytesIO(pdf_data))
    for pagenum, page in enumerate(pdf.pages, start=1):
        countries[pagenum] = []
        # last_entry is an index into the current page's list only; carrying it over
        # a page break would index into the still-empty list of the new page
        last_entry = None
        logger.info(f"Starting to analyze page {pagenum}...")
        text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False)
        if not text or not all(marker in text for marker in table_markers):
            continue

        for line in text.split("\n"):
            collapsed = wide_gap_regex.sub(column_gap, line)
            row = country_regex.match(collapsed)
            if row is None:
                # check if this is just a linebreak in the country name and append the value to the previous country
                if collapsed == line.strip() and last_entry is not None and "Annex to ITU" not in line:
                    logger.debug(f"Adding to last country name: {line=}")
                    countries[pagenum][last_entry].name += f" {line.strip()}"
                else:
                    last_entry = None           # don't append line continuations of non-real countries to a real country
                continue

            # split the trailing part of the row into the dst column and the optional notes column
            fields = row.groupdict() | {"dst": None, "notes": None}
            if fields["end"] and fields["end"].strip() != "":
                end_splitting = fields["end"].split(column_gap)
                fields["dst"] = end_splitting[0]
                if len(end_splitting) >= 2:
                    fields["notes"] = end_splitting[1]
            fields = {key: (value.strip() if value is not None else None) for key, value in fields.items()}

            if fields["dst"] is None:           # all real countries have a dst entry
                last_entry = None               # don't append line continuations of non-real countries to a real country
                continue

            country_code = f"+{fields['code']}"
            pattern = subpattern_matchers(fields["format"], True)
            if pattern is None:
                # fall back to the "(x/y) + <lengths> digits" form: a parenthesized prefix followed by a length description
                superpattern = re.match(r"(\([0-9/]+\))[ ]*\+[ ]*(.+)[ ]+digits", fields["format"])
                if superpattern is not None:
                    subpattern = subpattern_matchers(superpattern.group(2), False)
                    if subpattern is not None:
                        pattern = superpattern.group(1).replace("/", "|") + subpattern
            if pattern is None:
                logger.warning(f"Unknown format description for {fields['country']} ({country_code}): '{fields['format']}'")
                pattern = "[0-9]+"
            country = Quicksy_Country(alpha2mapping, fields["country"], None, country_code, f"^{pattern}$")
            countries[pagenum].append(country)
            last_entry = len(countries[pagenum]) - 1
            logger.info(f"Page {pagenum}: Found {len(countries[pagenum])} countries so far...")

    logger.info(f"Parsing finished: Extracted {sum(len(cs) for cs in countries.values())} countries...")
    return [c for cs in countries.values() for c in cs]

def matcher(previous_result, regex, text, closure):
    """Apply ``regex`` to the start of ``text`` unless an earlier result already exists.

    Returns ``previous_result`` untouched when it is not None. Otherwise returns
    ``closure(match)`` for a successful ``re.match`` and None when nothing matched.
    """
    if previous_result is not None:
        return previous_result
    found = re.match(regex, text)
    return None if found is None else closure(found)

def subpattern_matchers(text, should_end_with_unit):
    """Turn a comma separated length description into a regex alternation of digit counts.

    Each comma separated part starting with "<n>", "<n> to <m>" or "up to <m>" becomes
    "[0-9]{n}", "[0-9]{n,m}" or "[0-9]{1,m}" respectively; parts that do not start that
    way are dropped. When ``should_end_with_unit`` is set, ``text`` must end in "digits",
    which is cut off before parsing.

    Returns the alternatives joined by "|" and wrapped in parentheses, or None if the
    unit is missing or no part could be parsed.
    """
    if should_end_with_unit:
        if text[-6:] != "digits":
            logger.error(f"should_end_with_unit set but not ending in 'digits': {text[-6:] = }")
            return None
        text = text[:-6]

    def to_quantifier(found):
        # "up to m" has no explicit lower bound, so it starts at one digit
        lower = "1" if found.group(1) == "up" else found.group(1)
        upper = found.group(3)
        bounds = lower if upper is None else f"{lower},{upper}"
        return f"[0-9]{{{bounds}}}"

    alternatives = [
        quantifier
        for piece in text.split(",")
        if (quantifier := matcher(None, r"(up|[0-9]+)([ ]*to[ ]*([0-9]+)[ ]*)?", piece.strip(), to_quantifier)) is not None
    ]
    if not alternatives:
        return None
    return "(" + "|".join(alternatives) + ")"

def get_sparql_results(query):
    """Send ``query`` to the Wikidata SPARQL endpoint and return the converted JSON response."""
    # both modules are only needed here, so they are imported locally
    from SPARQLWrapper import JSON, SPARQLWrapper
    import sys

    major, minor = sys.version_info[0], sys.version_info[1]
    endpoint = SPARQLWrapper(
        "https://query.wikidata.org/sparql",
        agent=f"monal-im itu pdf parser/{major}.{minor}",
    )
    endpoint.setReturnFormat(JSON)
    endpoint.setQuery(query)
    return endpoint.query().convert()


# Build the lookup table from English wikidata labels to ISO 3166-1 alpha-2 codes.
logger.info("Downloading Wikidata country names to ISO 3166-1 alpha-2 codes mapping...")
results = get_sparql_results("""SELECT ?country ?countryLabel ?code WHERE {
	?country wdt:P297 ?code .
	SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}""")
alpha2mapping = {result["countryLabel"]["value"]: result["code"]["value"] for result in results["results"]["bindings"]}

logger.info("Downloading PDF...")
# the timeout keeps the script from hanging indefinitely on a stalled connection
response = requests.get("https://www.itu.int/dms_pub/itu-t/opb/sp/T-SP-E.164C-2011-PDF-E.pdf", timeout=60)
# fail loudly on an HTTP error instead of handing an error page to the PDF parser
response.raise_for_status()
countries = parse_pdf(response.content, alpha2mapping)

# output complete Objective-C code
print("""// This file was automatically generated by scripts/itu_pdf_to_objc.py
// Please run this python script again to update this file
// Example ../scripts/itu_pdf_to_objc.py >Classes/HelperTools+Quicksy_CountryCodes.m

#import "Quicksy_Country.h"
#import "HelperTools.h"

NSArray* _Nonnull COUNTRY_CODES = @[];      //will be replaced by actual values in +load below

@implementation HelperTools (CountryCodes)

//see https://stackoverflow.com/a/13326633 and https://fek.io/blog/method-swizzling-in-obj-c-and-swift/
+(void) load
{
    if(self == HelperTools.self)
    {
        static dispatch_once_t onceToken;
        dispatch_once(&onceToken, ^{
            COUNTRY_CODES = @[""")
# one array element per country, rendered by Quicksy_Country.__repr__
for country in countries:
    print(f"                {country}")
print("""            ];
        });
    }
}

@end""")
copy monal src 2024-11-18 14:53:52 +00:00			`#!/usr/bin/env python3`
			`import requests`
			`import io`
			`from pypdf import PdfReader`
			`import re`
			`import logging`

			`logging.basicConfig(level=logging.DEBUG, format="%(asctime)s [%(levelname)-7s] %(name)s {%(threadName)s} %(filename)s:%(lineno)d: %(message)s")`
			`logger = logging.getLogger(__name__)`

			`class Quicksy_Country:`
			`def __init__(self, alpha2mapping, name, alpha2, code, pattern):`
			`self.alpha2mapping = alpha2mapping`
			`self.name = name`
			`self.alpha2 = alpha2`
			`self.code = code`
			`self.pattern = pattern`

			`def __repr__(self):`
			`# map ITU country names to wikidata names`
			`itu2wikidata = {`
			`"Ireland": "Republic of Ireland",`
			`"China": "People's Republic of China",`
			`"Taiwan, China": "Taiwan",`
			`"Hong Kong, China": "Hong Kong",`
			`"Gambia": "The Gambia",`
			`"Falkland Islands (Malvinas)": "Falkland Islands",`
			`"Dominican Rep.": "Dominican Republic",`
			`"Dem. Rep. of the Congo": "Democratic Republic of the Congo",`
			`"Congo": "Republic of the Congo",`
			`"Czech Rep.": "Czech Republic",`
			`"Dem. People's Rep. of Korea": "North Korea",`
			`"Central African Rep.": "Central African Republic",`
			`"Bolivia (Plurinational State of)": "Bolivia",`
			`"Bahamas": "The Bahamas",`
			`"Korea (Rep. of)": "South Korea",`
			`"Iran (Islamic Republic of)": "Iran",`
			`"Lao P.D.R.": "Laos",`
			`"Moldova (Republic of)": "Moldova",`
			`"Micronesia": "Federated States of Micronesia",`
			`"Netherlands": "Kingdom of the Netherlands",`
			`"Russian Federation": "Russia",`
			`"Syrian Arab Republic": "Syria",`
			`"The Former Yugoslav Republic of Macedonia": "North Macedonia",`
			`"United States": "United States of America",`
			`"Vatican": "Vatican City",`
			`"Venezuela (Bolivarian Republic of)": "Venezuela",`
			`"Viet Nam": "Vietnam",`
			`"Swaziland": "Eswatini",`
			`"Sint Maarten (Dutch part)": "Sint Maarten",`
			`"Brunei Darussalam": "Brunei",`
			`"Bonaire, Sint Eustatius and Saba": "Caribbean Netherlands",`
			`"Côte d'Ivoire": "Ivory Coast",`
			`"Sao Tome and Principe": "São Tomé and Príncipe",`
			`"Timor-Leste": "East Timor",`
			`"Northern Marianas": "Northern Mariana Islands",`
			`}`
			`country = self.name`
			`if country in itu2wikidata:`
			`country = itu2wikidata[country]`

			`# map ITU country names to wikidata names and return swift code with alpha-2 country code instead of localizable name`
			`if country in alpha2mapping:`
			`return f"[[Quicksy_Country alloc] initWithName:nil alpha2:@\"{alpha2mapping[country]}\" code:@\"{self.code}\" pattern:@\"{self.pattern}\"],"`
			`# return swift code with localizable name for every country we don't know the alpha-2 code for`
			`return f"[[Quicksy_Country alloc] initWithName:NSLocalizedString(@\"{self.name}\", @\"quicksy country\") alpha2:nil code:@\"{self.code}\" pattern:@\"{self.pattern}\"],"`

			`def parse_pdf(pdf_data, alpha2mapping):`
			`logger.info("Parsing PDF...")`
			`country_regex = re.compile(r'^(?P<country>[^0-9]+)[ ]{32}(?P<code>[0-9]+)[ ]{32}(?P<international_prefix>.+)[ ]{32}(?P<national_prefix>.+)[ ]{32}(?P<format>.+ digits)[ ]{32}(?P<end>.*)$')`
			`country_end_regex = re.compile(r'^(?P<dst>.*)([ ]{32}(?P<notes>.+))?$')`
			`countries = {}`
			`pdf = PdfReader(io.BytesIO(pdf_data))`
			`pagenum = 0`
			`last_entry = None`
			`for page in pdf.pages:`
			`pagenum += 1`
			`countries[pagenum] = []`
			`logger.info(f"Starting to analyze page {pagenum}...")`
			`text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False)`
			`if text and "Country/geographical area" in text and "Country" in text and "International" in text and "National" in text and "National (Significant)" in text and "UTC/DST" in text and "Note" in text:`
			`for line in text.split("\n"):`
			`#this is faster than having a "{128,} in the compiled country_regex`
			`match = country_regex.match(re.sub("[ ]{128,}", " "*32, line))`
			`if match == None:`
			`# check if this is just a linebreak in the country name and append the value to the previous country`
			`if re.sub("[ ]{128,}", " "*32, line) == line.strip() and last_entry != None and "Annex to ITU" not in line:`
			`logger.debug(f"Adding to last country name: {line=}")`
			`countries[pagenum][last_entry].name += f" {line.strip()}"`
			`else:`
			`last_entry = None # don't append line continuations of non-real countries to a real country`
			`else:`
			`match = match.groupdict() \| {"dst": None, "notes": None}`
			`if match["end"] and match["end"].strip() != "":`
			`end_splitting = match["end"].split(" "*32)`
			`if len(end_splitting) >= 1:`
			`match["dst"] = end_splitting[0]`
			`if len(end_splitting) >= 2:`
			`match["notes"] = end_splitting[1]`
			`match = {key: (value.strip() if value != None else None) for key, value in match.items()}`
			`# logger.debug("****************")`
			`# logger.debug(f"{match['country'] = }")`
			`# logger.debug(f"{match['code'] = }")`
			`# logger.debug(f"{match['international_prefix'] = }")`
			`# logger.debug(f"{match['national_prefix'] = }")`
			`# logger.debug(f"{match['format'] = }")`
			`# logger.debug(f"{match['dst'] = }")`
			`# logger.debug(f"{match['notes'] = }")`

			`if match["dst"] == None: # all real countries have a dst entry`
			`last_entry = None # don't append line continuations of non-real countries to a real country`
			`else:`
			`country_code = f"+{match['code']}"`
			`pattern = subpattern_matchers(match['format'], True)`
			`superpattern = matcher(pattern, r"(\([0-9/]+\))[ ]\+[ ](.+)[ ]+digits", match['format'], lambda result: result)`
			`if pattern == None and superpattern != None:`
			`#logger.debug(f"Trying superpattern: '{match['format']}' --> '{superpattern.group(1)}' ## '{superpattern.group(2)}'")`
			`subpattern = subpattern_matchers(superpattern.group(2), False)`
			`if subpattern != None:`
			`pattern = re.sub("/", "\|", superpattern.group(1)) + subpattern`
			`if pattern == None:`
			`logger.warning(f"Unknown format description for {match['country']} ({country_code}): '{match['format']}'")`
			`pattern = "[0-9]+"`
			`country = Quicksy_Country(alpha2mapping, match['country'], None, country_code, f"^{pattern}$")`
			`countries[pagenum].append(country)`
			`last_entry = len(countries[pagenum]) - 1`
			`logger.info(f"Page {pagenum}: Found {len(countries[pagenum])} countries so far...")`

			`logger.info(f"Parsing finished: Extracted {sum([len(cs) for cs in countries.values()])} countries...")`
			`return [c for cs in countries.values() for c in cs]`

			`def matcher(previous_result, regex, text, closure):`
			`if previous_result != None:`
			`return previous_result`
			`matches = re.match(regex, text)`
			`if matches == None:`
			`return None`
			`else:`
			`return closure(matches)`

			`def subpattern_matchers(text, should_end_with_unit):`
			`if should_end_with_unit:`
			`if text[-6:] != "digits":`
			`logger.error(f"should_end_with_unit set but not ending in 'digits': {text[-6:] = }")`
			`return None`
			`text = text[:-6]`

			`def subdef(result):`
			`retval = f"[0-9]{{"`
			`grp1 = result.group(1) if result.group(1) != "up" else "1"`
			`retval += f"{grp1}"`
			`if result.group(3) != None:`
			`retval += f",{result.group(3)}"`
			`retval += f"}}"`
			`return retval`
			`pattern = []`
			`parts = [x.strip() for x in text.split(",")]`
			`for part in parts:`
			`result = matcher(None, r"(up\|[0-9]+)([ ]to[ ]([0-9]+)[ ]*)?", part, subdef)`
			`#logger.debug(f"{part=} --> {result=}")`
			`if result != None:`
			`pattern.append(result)`
			`if len(pattern) == 0:`
			`return None`
			`return "(" + "\|".join(pattern) + ")"`

			`def get_sparql_results(query):`
			`import sys`
			`from SPARQLWrapper import SPARQLWrapper, JSON`
			`user_agent = "monal-im itu pdf parser/%s.%s" % (sys.version_info[0], sys.version_info[1])`
			`sparql = SPARQLWrapper("https://query.wikidata.org/sparql", agent=user_agent)`
			`sparql.setQuery(query)`
			`sparql.setReturnFormat(JSON)`
			`return sparql.query().convert()`


			`logger.info("Downloading Wikidata country names to ISO 3166-1 alpha-2 codes mapping...")`
			`results = get_sparql_results("""SELECT ?country ?countryLabel ?code WHERE {`
			`?country wdt:P297 ?code .`
			`SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }`
			`}""")`
			`alpha2mapping = {result["countryLabel"]["value"]: result["code"]["value"] for result in results["results"]["bindings"]}`

			`logger.info("Downloading PDF...")`
			`response = requests.get("https://www.itu.int/dms_pub/itu-t/opb/sp/T-SP-E.164C-2011-PDF-E.pdf")`
			`countries = parse_pdf(response.content, alpha2mapping)`

			`# output complete swift code`
			`print("""// This file was automatically generated by scripts/itu_pdf_to_objc.py`
			`// Please run this python script again to update this file`
			`// Example ../scripts/itu_pdf_to_objc.py >Classes/HelperTools+Quicksy_CountryCodes.m`

			`#import "Quicksy_Country.h"`
			`#import "HelperTools.h"`

			`NSArray* _Nonnull COUNTRY_CODES = @[]; //will be replaced by actual values in +load below`

			`@implementation HelperTools (CountryCodes)`

			`//see https://stackoverflow.com/a/13326633 and https://fek.io/blog/method-swizzling-in-obj-c-and-swift/`
			`+(void) load`
			`{`
			`if(self == HelperTools.self)`
			`{`
			`static dispatch_once_t onceToken;`
			`dispatch_once(&onceToken, ^{`
			`COUNTRY_CODES = @[""")`
			`for country in countries:`
			`print(f" {country}")`
			`print(""" ];`
			`});`
			`}`
			`}`

			`@end""")`