Documentation & program updates

- dependencies file for pip packages - internal code cleanup - updated README
joergschultzelutter · Oct 6, 2021 · 07eb28a · 07eb28a
1 parent a3e0f79
commit 07eb28a
Show file tree

Hide file tree

Showing 4 changed files with 52 additions and 11 deletions.
diff --git a/.gitignore b/.gitignore
@@ -127,3 +127,5 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+.DS_Store
diff --git a/README.md b/README.md
@@ -15,9 +15,14 @@ Benötigte pip-pakete:
 - [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
 - [requests](https://github.com/psf/requests)
 
+Installation aller Pakete via
+
+    pip install -r dependencies.txt
+
 ## Aufruf
 
-Programm einfach ohne Parameter aufrufen. Die Rufzeichenliste wird heruntergeladen, analysiert und anschließend zeilenweise auf dem Bildschirm ausgegeben.
+Programm einfach ohne Parameter aufrufen. Die Rufzeichenliste wird heruntergeladen, analysiert und anschließend zeilenweise auf dem Bildschirm ausgegeben. Die Datenstruktur ist identisch mit der in der pdf-Datei verwendeten Datenstruktur - d.h. Callsign, Klasse (A/E), Name und Anschrift(en)
 
 ## Bekannte Einschränkungen
-- Die PDF-Quelldatei besteht aus insgesamt drei Spalten. Ist die Seite nicht vollständig gefüllt (z.B. neue Kategorie beginnt auf der nächsten Seite), so wird im Extrakt die Reihenfolge der Einträge möglicherweise nicht korrekt in das CSV-Dateiformat überführt. Da die erzeugte CSV-Datei aber eh maschinell weiterverarbeitet werden wird, sollte dieser Punkt irrelevant sein.
+
+- Die PDF-Quelldatei besteht aus insgesamt drei Spalten. Ist eine Seite der pdf-Datei nicht vollständig gefüllt (z.B. weil auf der Folgeseite eine neue Dokument-Kategorie beginnt), so wird im Extrakt die Reihenfolge der Einträge möglicherweise nicht korrekt in das CSV-Dateiformat überführt. Da die erzeugte CSV-Datei aber eh maschinell weiterverarbeitet werden wird, sollte dieser Punkt irrelevant sein.
diff --git a/dependencies.txt b/dependencies.txt
@@ -0,0 +1,2 @@
+pdfminer.six>=20201018
+requests>=2.25.1
diff --git a/src/bnetza-parser.py b/src/bnetza-parser.py
@@ -42,31 +42,57 @@
 logger = logging.getLogger(__name__)
 
 
-# Herunterladen der Rufzeichenliste von der Bundesnetzagentur
 def get_rufzeichen_file(
     request_url: str = "https://www.bundesnetzagentur.de/SharedDocs/Downloads/DE/Sachgebiete/Telekommunikation/Unternehmen_Institutionen/Frequenzen/Amateurfunk/Rufzeichenliste/Rufzeichenliste_AFU.pdf?__blob=publicationFile",
 ):
-
+    """
+    Imports the ICAO/IATA data from a local file. Creates dictionaries for
+    IATA-ICAO mapping and IATA-lat/lon mapping
+    Parameters
+    ==========
+    url : 'str'
+        Default value always points to the latest Bundesnetzagentur
+        publication file
+    Returns
+    =======
+    file_blob: 'bytes'
+        File content or 'None' if the file could not get downloaded
+    """
+
+    # http request header
     request_headers = {"User-Agent": "Mozilla"}
+
+    # our target variable
     file_blob = None
 
+    # try to download the file
+
+    logger.debug(msg=f"Trying to download file from {request_url}")
     try:
         resp = requests.get(url=request_url, headers=request_headers)
     except:
+        logger.error(msg="Unable to download file; cannot continue")
         resp = None
 
     if resp:
         if resp.status_code == 200:
             file_blob = resp.content
+            logger.info(msg="File successfully acquired")
     else:
-        logger.info(msg="Cannot download Rufzeichen file")
+        logger.error(
+            msg=f"Unable to download file; cannot continue. HTTP error {resp.status_code}"
+        )
 
     return file_blob
 
 
 if __name__ == "__main__":
+    # start the process by downloading the file
     file_content = get_rufzeichen_file()
+
+    # did we get a file? Then process it
     if file_content:
+        # This variable will later contain the text per pdf page
         output_string = StringIO()
 
         # Create the parser
@@ -79,10 +105,12 @@ def get_rufzeichen_file(
         if not document.is_extractable:
             raise PDFDocument.PDFTextExtractionNotAllowed
 
+        # Init our PDF processor
         rsrcmgr = PDFResourceManager()
         device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
         interpreter = PDFPageInterpreter(rsrcmgr, device)
 
+        # regex for German call signs
         regex_string = r"^(D[A-D|F-R][0-9][A-Z]{1,3}),\s(A|E),"
 
         # loop over all pages in the document
@@ -109,9 +137,9 @@ def get_rufzeichen_file(
                         matches = re.search(regex_string, parser_value)
                         if matches:
                             # transpose the value to a unified csv format
-                            parser_value = parser_value.replace(", ", ";").replace(
-                                "; ", ";"
-                            )
+                            # fmt:off
+                            parser_value = parser_value.replace(", ", ";").replace("; ", ";")
+                            # fmt:on
                             # and print the value
                             print(parser_value)
                     # Accept the value and tell the parser that additional lines can be attached
@@ -126,9 +154,10 @@ def get_rufzeichen_file(
                         if parser_value != "":
                             matches = re.search(regex_string, parser_value)
                             if matches:
-                                parser_value = parser_value.replace(", ", ";").replace(
-                                    "; ", ";"
-                                )
+                                # fmt:off
+                                # transpose the value to a unified csv format
+                                parser_value = parser_value.replace(", ", ";").replace("; ", ";")
+                                # fmt:on
                                 print(parser_value)
                         parser_value = ""
                     # "Seite" --> end of page; we can start attaching again
@@ -142,7 +171,10 @@ def get_rufzeichen_file(
             if parser_value != "":
                 matches = re.search(regex_string, parser_value)
                 if matches:
+                    # transpose the value to a unified csv format
                     parser_value = parser_value.replace(", ", ";").replace("; ", ";")
                     print(parser_value)
+
+            # truncate the output string to zero length and then loop to the next doc page
             output_string.seek(0)
             output_string.truncate(0)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -127,3 +127,5 @@ dmypy.json

		# Pyre type checker
		.pyre/

		.DS_Store