From cfb7b80d1429e1d90823b7548e96c54ae62140e7 Mon Sep 17 00:00:00 2001
From: Mike <rans@email.com>
Date: Tue, 23 Jul 2024 18:34:53 +1200
Subject: [PATCH] HDXDSYS-898 Refactor org code (#134)

* Add uncleaned names as keys to lookups
Remove intersectoral check
dict() to {}
Move branches that have continue higher up in loops in operational presence

* Some rearrangement of operational presence and comments to help me understand the org processing

* Small reorg of org type if code

* store normalised keys as well as non normalised

* Use already normalised key

* Simplify org lookup code and operational presence

* Add to lookup to reduce need to keep normalising

* Pass around normalise variables

* Remove org lookup only used in test

* Remove unnecessary variable

* Update CHANGELOG

* Can just return value here

* Combine ifs

* Use named tuples for clarity

* Rename value to org_info

* Use org_data in populate_multiple

* Use OrgData in tests

* Make OrgInfo into a data class
Add used and complete bools to OrgInfo
Correct OrgInfo objects in org_map with corrections from looking up in data member variable

* Make separate function

* Add debug option to command line
Add org_map debug
---
 CHANGELOG.md                                  |   7 +
 pyproject.toml                                |   2 +-
 requirements.txt                              |  10 +-
 src/hapi/pipelines/app/__main__.py            |  12 +
 src/hapi/pipelines/app/pipelines.py           |   5 +-
 src/hapi/pipelines/database/conflict_event.py |   2 +-
 .../database/operational_presence.py          | 181 ++++++--------
 src/hapi/pipelines/database/org.py            | 196 ++++++++++-----
 src/hapi/pipelines/database/org_type.py       |   5 +-
 src/hapi/pipelines/database/sector.py         |   9 +-
 src/hapi/pipelines/utilities/mappings.py      |   7 +-
 tests/test_main.py                            | 228 +++++++++++++++++-
 12 files changed, 476 insertions(+), 188 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3b1e69e1..bfc5eb82 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
+## [0.9.36] - 2024-07-19
+
+### Changed
+
+- Refactor org code
+- Also add uncleaned names as keys to lookups
+
 ## [0.9.35] - 2024-07-18
 
 ### Fixed
diff --git a/pyproject.toml b/pyproject.toml
index ab13da26..40240592 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,7 @@ requires-python = ">=3.8"
 dependencies = [
     "hapi-schema>=0.8.14",
     "hdx-python-api>= 6.3.1",
-    "hdx-python-country>= 3.7.6",
+    "hdx-python-country>= 3.7.7",
     "hdx-python-database[postgresql]>= 1.3.1",
     "hdx-python-scraper>= 2.4.0",
     "hdx-python-utilities>= 3.7.2",
diff --git a/requirements.txt b/requirements.txt
index 4dd27bba..09904c26 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,7 +26,7 @@ click==8.1.7
     # via typer
 coverage==7.6.0
     # via pytest-cov
-cryptography==42.0.8
+cryptography==43.0.0
     # via pyopenssl
 defopt==6.4.0
     # via hdx-python-api
@@ -64,7 +64,7 @@ hdx-python-api==6.3.1
     # via
     #   hapi-pipelines (pyproject.toml)
     #   hdx-python-scraper
-hdx-python-country==3.7.6
+hdx-python-country==3.7.7
     # via
     #   hapi-pipelines (pyproject.toml)
     #   hdx-python-api
@@ -172,13 +172,13 @@ pydantic-core==2.20.1
     # via pydantic
 pygments==2.18.0
     # via rich
-pyopenssl==24.1.0
+pyopenssl==24.2.1
     # via
     #   hdx-python-api
     #   ndg-httpsclient
 pyphonetics==0.5.3
     # via hdx-python-country
-pytest==8.2.2
+pytest==8.3.1
     # via
     #   hapi-pipelines (pyproject.toml)
     #   pytest-check
@@ -242,7 +242,7 @@ ruamel-yaml==0.18.6
     # via hdx-python-utilities
 ruamel-yaml-clib==0.2.8
     # via ruamel-yaml
-setuptools==70.3.0
+setuptools==71.1.0
     # via ckanapi
 shellingham==1.5.4
     # via typer
diff --git a/src/hapi/pipelines/app/__main__.py b/src/hapi/pipelines/app/__main__.py
index e04cc5d7..87c0b7b5 100755
--- a/src/hapi/pipelines/app/__main__.py
+++ b/src/hapi/pipelines/app/__main__.py
@@ -84,6 +84,13 @@ def parse_args():
         action="store_true",
         help="Use saved data",
     )
+    parser.add_argument(
+        "-dbg",
+        "--debug",
+        default=False,
+        action="store_true",
+        help="Debug",
+    )
     return parser.parse_args()
 
 
@@ -95,6 +102,7 @@ def main(
     basic_auths: Optional[Dict[str, str]] = None,
     save: bool = False,
     use_saved: bool = False,
+    debug: bool = False,
     **ignore,
 ) -> None:
     """Run HAPI. Either a database connection string (db_uri) or database
@@ -110,6 +118,7 @@ def main(
         basic_auths (Optional[Dict[str, str]]): Basic authorisations
         save (bool): Whether to save state for testing. Defaults to False.
         use_saved (bool): Whether to use saved state for testing. Defaults to False.
+        debug (bool): Whether to output debug info. Defaults to False.
 
     Returns:
         None
@@ -156,6 +165,8 @@ def main(
                 )
                 pipelines.run()
                 pipelines.output()
+                if debug:
+                    pipelines.debug("debug")
     logger.info("HAPI pipelines completed!")
 
 
@@ -233,4 +244,5 @@ def main(
         basic_auths=basic_auths,
         save=args.save,
         use_saved=args.use_saved,
+        debug=args.debug,
     )
diff --git a/src/hapi/pipelines/app/pipelines.py b/src/hapi/pipelines/app/pipelines.py
index b0499ce2..10576ad8 100644
--- a/src/hapi/pipelines/app/pipelines.py
+++ b/src/hapi/pipelines/app/pipelines.py
@@ -96,7 +96,7 @@ def __init__(
             errors_on_exit=errors_on_exit,
             scrapers_to_run=scrapers_to_run,
         )
-        self.configurable_scrapers = dict()
+        self.configurable_scrapers = {}
         self.create_configurable_scrapers()
         self.metadata = Metadata(
             runner=self.runner, session=session, today=today
@@ -333,3 +333,6 @@ def output(self):
             self.wfp_commodity.populate()
             self.wfp_market.populate()
             self.food_price.populate()
+
+    def debug(self, folder: str) -> None:  # Write the org map debug CSV into folder
+        self.org.output_org_map(folder)
diff --git a/src/hapi/pipelines/database/conflict_event.py b/src/hapi/pipelines/database/conflict_event.py
index 351561a3..e0b8919f 100644
--- a/src/hapi/pipelines/database/conflict_event.py
+++ b/src/hapi/pipelines/database/conflict_event.py
@@ -103,7 +103,7 @@ def populate(self):
             batch_populate(conflict_event_rows, self._session, DBConflictEvent)
 
         for dataset, msg in self._config.get(
-            "conflict_event_error_messages", dict()
+            "conflict_event_error_messages", {}
         ).items():
             add_message(errors, dataset, msg)
         for error in sorted(errors):
diff --git a/src/hapi/pipelines/database/operational_presence.py b/src/hapi/pipelines/database/operational_presence.py
index 1314d0ad..2b90c648 100644
--- a/src/hapi/pipelines/database/operational_presence.py
+++ b/src/hapi/pipelines/database/operational_presence.py
@@ -1,12 +1,10 @@
 """Functions specific to the operational presence theme."""
 
 from logging import getLogger
-from os.path import join
-from typing import Dict
+from typing import Dict, Optional, Set
 
 from hapi_schema.db_operational_presence import DBOperationalPresence
 from hdx.location.adminlevel import AdminLevel
-from hdx.utilities.dictandlist import write_list_to_csv
 from hdx.utilities.text import normalise
 from sqlalchemy.orm import Session
 
@@ -15,7 +13,7 @@
 from . import admins
 from .base_uploader import BaseUploader
 from .metadata import Metadata
-from .org import Org
+from .org import Org, OrgInfo
 from .org_type import OrgType
 from .sector import Sector
 
@@ -47,11 +45,39 @@ def __init__(
         self._results = results
         self._config = config
 
-    def populate(self, debug=False):
+    def complete_org_info(  # Fill gaps in org_info, then register the org
+        self,
+        org_info: OrgInfo,  # Record from Org.get_org_info; updated in place
+        org_acronym: Optional[str],  # Acronym value from the dataset row
+        org_type_name: Optional[str],  # Org type value from the dataset row
+        errors: Set[str],  # Collects messages about unmatched org types
+        dataset_name: str,  # Dataset named in any message added to errors
+    ):
+        if org_info.acronym is None and org_acronym is not None:
+            if len(org_acronym) > 32:  # presumably the DB column limit - confirm
+                org_acronym = org_acronym[:32]
+            org_info.acronym = org_acronym
+            org_info.normalised_acronym = normalise(org_acronym)
+
+        # * Org type processing: only look up a code if none is known yet
+        if org_info.type_code is None and org_type_name is not None:
+            org_type_code = self._org_type.get_org_type_code(org_type_name)
+            if org_type_code:
+                org_info.type_code = org_type_code
+            else:
+                add_missing_value_message(  # Name matched no org type code
+                    errors,
+                    dataset_name,
+                    "org type",
+                    org_type_name,
+                )
+
+        # * Org matching: merge with or add to the rows for the org table
+        self._org.add_or_match_org(org_info)
+
+    def populate(self):
         logger.info("Populating operational presence table")
         operational_presence_rows = []
-        if debug:
-            debug_rows = []
         errors = set()
         for dataset in self._results.values():
             dataset_name = dataset["hdx_stub"]
@@ -59,8 +85,6 @@ def populate(self, debug=False):
             time_period_end = dataset["time_period"]["end"]
             number_duplicates = 0
             for admin_level, admin_results in dataset["results"].items():
-                resource_id = admin_results["hapi_resource_metadata"]["hdx_id"]
-                hxl_tags = admin_results["headers"][1]
                 values = admin_results["values"]
                 # Add this check to see if there is no data, otherwise get a confusing
                 # sqlalchemy error
@@ -70,6 +94,17 @@ def populate(self, debug=False):
                         f" {dataset_name} has no data, "
                         f"please check configuration"
                     )
+                hxl_tags = admin_results["headers"][1]
+                # If config is missing sector, add to error messages
+                try:
+                    sector_index = hxl_tags.index("#sector")
+                except ValueError:
+                    add_message(
+                        errors,
+                        dataset_name,
+                        "missing sector in config, dataset skipped",
+                    )
+                    continue
                 # Config must contain an org name
                 org_name_index = hxl_tags.index("#org+name")
                 # If config is missing org acronym, use the org name
@@ -82,118 +117,68 @@ def populate(self, debug=False):
                     org_type_name_index = hxl_tags.index("#org+type+name")
                 except ValueError:
                     org_type_name_index = None
-                # If config is missing sector, add to error messages
-                try:
-                    sector_index = hxl_tags.index("#sector")
-                except ValueError:
-                    add_message(
-                        errors,
-                        dataset_name,
-                        "missing sector in config, dataset skipped",
-                    )
-                    continue
+                resource_id = admin_results["hapi_resource_metadata"]["hdx_id"]
                 for admin_code, org_names in values[org_name_index].items():
-                    for i, org_name_orig in enumerate(org_names):
-                        admin2_code = admins.get_admin2_code_based_on_level(
-                            admin_code=admin_code, admin_level=admin_level
-                        )
-                        org_acronym_orig = values[org_acronym_index][
-                            admin_code
-                        ][i]
-                        if not org_name_orig:
-                            org_name_orig = org_acronym_orig
+                    for i, org_str in enumerate(org_names):
+                        # * Sector processing
                         sector_orig = values[sector_index][admin_code][i]
                         # Skip rows that are missing a sector
                         if not sector_orig:
                             add_message(
                                 errors,
                                 dataset_name,
-                                f"org {org_name_orig} missing sector",
+                                f"org {org_str} missing sector",
                             )
                             continue
-                        org_type_orig = None
-                        if org_type_name_index:
-                            org_type_orig = values[org_type_name_index][
-                                admin_code
-                            ][i]
-                        country_code = admin_code
+                        sector_code = self._sector.get_sector_code(sector_orig)
+                        if not sector_code:
+                            add_missing_value_message(
+                                errors, dataset_name, "sector", sector_orig
+                            )
+                            continue
+
+                        # * Admin processing
                         if admin_level == "admintwo":
                             country_code = self._admintwo.pcode_to_iso3.get(
                                 admin_code
                             )
-                        if admin_level == "adminone":
+                        elif admin_level == "adminone":
                             country_code = self._adminone.pcode_to_iso3.get(
                                 admin_code
                             )
-                        org_info = self._org.get_org_info(
-                            org_name_orig, location=country_code
+                        else:
+                            country_code = admin_code
+                        admin2_code = admins.get_admin2_code_based_on_level(
+                            admin_code=admin_code, admin_level=admin_level
                         )
-                        org_name = org_info.get("#org+name")
-                        self._org.add_org_to_lookup(org_name_orig, org_name)
-                        org_acronym = org_info.get(
-                            "#org+acronym",
-                            values[org_acronym_index][admin_code][i],
+                        admin2_ref = self._admins.admin2_data[admin2_code]
+
+                        # * Org processing
+                        if not org_str:
+                            org_str = values[org_acronym_index][admin_code][i]
+                        org_info = self._org.get_org_info(
+                            org_str, location=country_code
                         )
-                        if org_acronym is not None and len(org_acronym) > 32:
-                            org_acronym = org_acronym[:32]
-                        org_type_code = org_info.get("#org+type+code")
-                        org_type_name = None
-                        if not org_type_code:
+                        if not org_info.complete:
                             if org_type_name_index:
                                 org_type_name = values[org_type_name_index][
                                     admin_code
                                 ][i]
-                                if org_type_name:
-                                    org_type_code = (
-                                        self._org_type.get_org_type_code(
-                                            org_type_name
-                                        )
-                                    )
-                        if org_type_name and not org_type_code:
-                            add_missing_value_message(
-                                errors, dataset_name, "org type", org_type_name
-                            )
-                        self._org.add_or_match_org(
-                            acronym=org_acronym,
-                            org_name=org_name,
-                            org_type=org_type_code,
-                        )
-                        org_acronym, org_name, org_type = self._org.data[
-                            (
-                                normalise(org_acronym),
-                                normalise(org_name),
-                            )
-                        ]
-                        sector_code = self._sector.get_sector_code(sector_orig)
-                        if debug:
-                            debug_row = {
-                                "location": country_code,
-                                "org_name_orig": org_name_orig,
-                                "org_acronym_orig": org_acronym_orig,
-                                "org_type_orig": org_type_orig,
-                                "sector_orig": sector_orig,
-                                "org_name": org_name,
-                                "org_acronym": org_acronym,
-                                "org_type": org_type_code,
-                                "sector": sector_code,
-                            }
-                            if debug_row in debug_rows:
-                                continue
-                            debug_rows.append(debug_row)
-                            continue
-
-                        if not sector_code:
-                            add_missing_value_message(
-                                errors, dataset_name, "sector", sector_orig
+                            else:
+                                org_type_name = None
+                            self.complete_org_info(
+                                org_info,
+                                values[org_acronym_index][admin_code][i],
+                                org_type_name,
+                                errors,
+                                dataset_name,
                             )
-                            continue
 
-                        admin2_ref = self._admins.admin2_data[admin2_code]
                         operational_presence_row = dict(
                             resource_hdx_id=resource_id,
                             admin2_ref=admin2_ref,
-                            org_acronym=org_acronym,
-                            org_name=org_name,
+                            org_acronym=org_info.acronym,
+                            org_name=org_info.canonical_name,
                             sector_code=sector_code,
                             reference_period_start=time_period_start,
                             reference_period_end=time_period_end,
@@ -213,12 +198,6 @@ def populate(self, debug=False):
                     dataset_name,
                     f"{number_duplicates} duplicate rows found",
                 )
-        if debug:
-            write_list_to_csv(
-                join("saved_data", "debug_operational_presence.csv"),
-                debug_rows,
-            )
-            return
 
         logger.info("Writing to org table")
         self._org.populate_multiple()
@@ -228,7 +207,7 @@ def populate(self, debug=False):
         )
 
         for dataset, msg in self._config.get(
-            "operational_presence_error_messages", dict()
+            "operational_presence_error_messages", {}
         ).items():
             add_message(errors, dataset, msg)
         for error in sorted(errors):
diff --git a/src/hapi/pipelines/database/org.py b/src/hapi/pipelines/database/org.py
index 91bd7697..fab74d6c 100644
--- a/src/hapi/pipelines/database/org.py
+++ b/src/hapi/pipelines/database/org.py
@@ -1,11 +1,13 @@
 """Populate the org table."""
 
 import logging
-from typing import Dict
+from dataclasses import dataclass
+from os.path import join
+from typing import Dict, NamedTuple, Optional
 
 from hapi_schema.db_org import DBOrg
 from hdx.scraper.utilities.reader import Read
-from hdx.utilities.dictandlist import dict_of_sets_add
+from hdx.utilities.dictandlist import write_list_to_csv
 from hdx.utilities.text import normalise
 from sqlalchemy.orm import Session
 
@@ -17,6 +19,23 @@
 _BATCH_SIZE = 1000
 
 
+@dataclass
+class OrgInfo:  # Mutable record for one organisation held in Org._org_map
+    canonical_name: str  # Name written to the org table
+    normalised_name: str  # normalise(canonical_name); half of the Org.data key
+    acronym: Optional[str]  # Optional[], not "str | None": that fails on 3.8/3.9
+    normalised_acronym: Optional[str]  # normalise(acronym); None if no acronym
+    type_code: Optional[str]  # Org type code; None until one is found
+    used: bool = False  # Set to True by Org.add_or_match_org
+    complete: bool = False  # True once acronym and type_code are both set
+
+
+class OrgData(NamedTuple):  # Immutable row destined for the org table
+    acronym: Optional[str]  # Copied from OrgInfo.acronym, which may be None
+    name: str  # Copied from OrgInfo.canonical_name
+    type_code: Optional[str]  # Copied from OrgInfo.type_code, which may be None
+
+
 class Org(BaseUploader):
     def __init__(
         self,
@@ -27,7 +46,6 @@ def __init__(
         self._datasetinfo = datasetinfo
         self.data = {}
         self._org_map = {}
-        self._org_lookup = {}
 
     def populate(self):
         logger.info("Populating org mapping")
@@ -39,74 +57,128 @@ def populate(self):
             format="csv",
             file_prefix="org",
         )
-        for row in iterator:
-            org_name = row.get("#x_pattern")
-            canonical_org_name = row.get("#org+name")
-            if not canonical_org_name:
+
+        for i, row in enumerate(iterator):
+            canonical_name = row["#org+name"]
+            if not canonical_name:
+                logger.error(f"Canonical name is empty in row {i}!")
                 continue
-            self._org_map[org_name] = row
-            self._org_map[canonical_org_name] = row
-            org_acronym = row.get("#org+acronym")
-            if org_acronym:
-                self._org_map[org_acronym] = row
+            normalised_name = normalise(canonical_name)
+            country_code = row["#country+code"]
+            acronym = row["#org+acronym"]
+            if acronym:
+                normalised_acronym = normalise(acronym)
+            else:
+                normalised_acronym = None
+            org_name = row["#x_pattern"]
+            type_code = row["#org+type+code"]
+            org_info = OrgInfo(
+                canonical_name,
+                normalised_name,
+                acronym,
+                normalised_acronym,
+                type_code,
+            )
+            self._org_map[(country_code, canonical_name)] = org_info
+            self._org_map[(country_code, normalised_name)] = org_info
+            self._org_map[(country_code, acronym)] = org_info
+            self._org_map[(country_code, normalised_acronym)] = org_info
+            self._org_map[(country_code, org_name)] = org_info
+            self._org_map[(country_code, normalise(org_name))] = org_info
 
-    def add_or_match_org(
-        self,
-        acronym,
-        org_name,
-        org_type,
-    ):
-        key = (
-            normalise(acronym),
-            normalise(org_name),
+    def get_org_info(self, org_str: str, location: str) -> OrgInfo:  # Find or create the record for org_str
+        key = (location, org_str)  # Try the raw string for this location first
+        org_info = self._org_map.get(key)
+        if org_info:
+            return org_info
+        normalised_str = normalise(org_str)
+        org_info = self._org_map.get((location, normalised_str))
+        if org_info:
+            self._org_map[key] = org_info  # Cache so the raw string hits next time
+            return org_info
+        org_info = self._org_map.get((None, org_str))  # Entries with no country code
+        if org_info:
+            self._org_map[key] = org_info
+            return org_info
+        org_info = self._org_map.get((None, normalised_str))
+        if org_info:
+            self._org_map[key] = org_info
+            return org_info
+        org_info = OrgInfo(  # No match at all: start a record from the raw string
+            canonical_name=org_str,
+            normalised_name=normalised_str,
+            acronym=None,
+            normalised_acronym=None,
+            type_code=None,
         )
-        if key in self.data:
-            org_type_old = self.data[key][2]
-            if org_type and not org_type_old:
-                self.data[key][2] = org_type
-            # TODO: should we flag orgs if we find more than one org type?
-            return
-        self.data[
-            (
-                normalise(acronym),
-                normalise(org_name),
+        self._org_map[key] = org_info  # Later lookups of this string reuse the record
+        return org_info
+
+    def add_or_match_org(self, org_info: OrgInfo) -> OrgData:  # Register org_info in self.data
+        key = (org_info.normalised_acronym, org_info.normalised_name)
+        org_data = self.data.get(key)
+        if org_data:  # An org with the same normalised key is already stored
+            if not org_data.type_code and org_info.type_code:
+                org_data = OrgData(  # NamedTuple is immutable, so build a new row
+                    org_data.acronym, org_data.name, org_info.type_code
+                )
+                self.data[key] = org_data
+                # TODO: should we flag orgs if we find more than one org type?
+            else:
+                org_info.type_code = org_data.type_code
+            # The key matched on normalised acronym and normalised name, so
+            # only the non-normalised forms are copied from the stored row
+            org_info.acronym = org_data.acronym
+            org_info.canonical_name = org_data.name
+
+        else:  # First time this key is seen: store the org as given
+            org_data = OrgData(
+                org_info.acronym, org_info.canonical_name, org_info.type_code
             )
-        ] = [acronym, org_name, org_type]
+            self.data[key] = org_data
+        if org_info.acronym and org_info.type_code:
+            org_info.complete = True  # Nothing left to fill in for this org
+        org_info.used = True
+        return org_data
 
     def populate_multiple(self):
         org_rows = [
             dict(
-                acronym=values[0],
-                name=values[1],
-                org_type_code=values[2],
+                acronym=org_data.acronym,
+                name=org_data.name,
+                org_type_code=org_data.type_code,
             )
-            for values in self.data.values()
+            for org_data in self.data.values()
         ]
         batch_populate(org_rows, self._session, DBOrg)
 
-    def get_org_info(self, org_name: str, location: str) -> Dict[str, str]:
-        org_name_map = {
-            on: self._org_map[on]
-            for on in self._org_map
-            if self._org_map[on]["#country+code"] in [location, None]
-        }
-        org_map_info = org_name_map.get(org_name)
-        if not org_map_info:
-            org_name_map_clean = {
-                normalise(on): org_name_map[on] for on in org_name_map
-            }
-            org_name_clean = normalise(org_name)
-            org_map_info = org_name_map_clean.get(org_name_clean)
-        if not org_map_info:
-            return {"#org+name": org_name}
-        org_info = {"#org+name": org_map_info["#org+name"]}
-        if not org_info["#org+name"]:
-            org_info["#org+name"] = org_map_info["#x_pattern"]
-        if org_map_info["#org+acronym"]:
-            org_info["#org+acronym"] = org_map_info["#org+acronym"]
-        if org_map_info["#org+type+code"]:
-            org_info["#org+type+code"] = org_map_info["#org+type+code"]
-        return org_info
-
-    def add_org_to_lookup(self, org_name_orig, org_name_official):
-        dict_of_sets_add(self._org_lookup, org_name_official, org_name_orig)
+    def output_org_map(self, folder: str) -> None:  # Dump self._org_map to folder/org_map.csv
+        rows = [  # First row holds the column headings
+            (
+                "Country Code",
+                "Lookup",
+                "Canonical Name",
+                "Normalised Name",
+                "Acronym",
+                "Normalised Acronym",
+                "Type Code",
+                "Used",
+                "Complete",
+            )
+        ]
+        for key, org_info in self._org_map.items():  # One row per lookup key
+            country_code, lookup = key  # Keys are (country code, lookup string)
+            rows.append(
+                (
+                    country_code,
+                    lookup,
+                    org_info.canonical_name,
+                    org_info.normalised_name,
+                    org_info.acronym,
+                    org_info.normalised_acronym,
+                    org_info.type_code,
+                    "Y" if org_info.used else "N",
+                    "Y" if org_info.complete else "N",
+                )
+            )
+        write_list_to_csv(join(folder, "org_map.csv"), rows)  # NOTE(review): assumes folder exists - confirm
diff --git a/src/hapi/pipelines/database/org_type.py b/src/hapi/pipelines/database/org_type.py
index 4d5a3609..cfa35a48 100644
--- a/src/hapi/pipelines/database/org_type.py
+++ b/src/hapi/pipelines/database/org_type.py
@@ -29,6 +29,9 @@ def populate(self):
         logger.info("Populating org type table")
 
         def parse_org_type_values(code: str, description: str) -> None:
+            self.data[code] = code
+            self.data[description] = code
+            self.data[normalise(code)] = code
             self.data[normalise(description)] = code
             org_type_row = DBOrgType(
                 code=code,
@@ -60,7 +63,7 @@ def parse_org_type_values(code: str, description: str) -> None:
 
         self._session.commit()
 
-    def get_org_type_code(self, org_type: str) -> str:
+    def get_org_type_code(self, org_type: str) -> str | None:
         return get_code_from_name(
             name=org_type,
             code_lookup=self.data,
diff --git a/src/hapi/pipelines/database/sector.py b/src/hapi/pipelines/database/sector.py
index 8113a2c5..8f85580d 100644
--- a/src/hapi/pipelines/database/sector.py
+++ b/src/hapi/pipelines/database/sector.py
@@ -29,9 +29,10 @@ def populate(self):
         logger.info("Populating sector table")
 
         def parse_sector_values(code: str, name: str):
-            if code != "intersectoral":
-                self.data[normalise(name)] = code
-                self.data[normalise(code)] = code
+            self.data[name] = code
+            self.data[code] = code
+            self.data[normalise(name)] = code
+            self.data[normalise(code)] = code
             sector_row = DBSector(
                 code=code,
                 name=name,
@@ -59,7 +60,7 @@ def parse_sector_values(code: str, name: str):
 
         self._session.commit()
 
-    def get_sector_code(self, sector: str) -> str:
+    def get_sector_code(self, sector: str) -> str | None:
         return get_code_from_name(
             name=sector,
             code_lookup=self.data,
diff --git a/src/hapi/pipelines/utilities/mappings.py b/src/hapi/pipelines/utilities/mappings.py
index 6739914d..fc11e609 100644
--- a/src/hapi/pipelines/utilities/mappings.py
+++ b/src/hapi/pipelines/utilities/mappings.py
@@ -20,7 +20,7 @@ def get_code_from_name(
         fuzzy_match (bool): Allow fuzzy matching or not
 
     Returns:
-        str or None: matching code
+        str or None: Matching code
     """
     code = code_lookup.get(name)
     if code:
@@ -28,6 +28,7 @@ def get_code_from_name(
     name_clean = normalise(name)
     code = code_lookup.get(name_clean)
     if code:
+        code_lookup[name] = code
         return code
     if len(name) <= MATCH_THRESHOLD:
         return None
@@ -41,8 +42,8 @@ def get_code_from_name(
     )
     if name_index is None:
         return None
-    name = names[name_index]
-    code = code_lookup.get(name)
+    code = code_lookup.get(names[name_index])
     if code:
+        code_lookup[name] = code
         code_lookup[name_clean] = code
     return code
diff --git a/tests/test_main.py b/tests/test_main.py
index 8e31681d..8ff9e7f2 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -37,6 +37,7 @@
 from hapi.pipelines.app import load_yamls
 from hapi.pipelines.app.__main__ import add_defaults
 from hapi.pipelines.app.pipelines import Pipelines
+from hapi.pipelines.database.org import OrgInfo
 
 logger = logging.getLogger(__name__)
 
@@ -122,6 +123,8 @@ def test_pipelines(self, configuration, folder):
                     pipelines.run()
                     logger.info("Writing to database")
                     pipelines.output()
+                    logger.info("Writing debug output")
+                    pipelines.debug(temp_folder)
 
                     count = session.scalar(
                         select(func.count(DBResource.hdx_id))
@@ -195,16 +198,223 @@ def test_pipelines(self, configuration, folder):
                         select(func.count(DBFoodPrice.resource_hdx_id))
                     )
                     check.equal(count, 31615)
-                    org_mapping = pipelines.org._org_lookup
-                    assert org_mapping[
-                        "International Organization for Migration"
-                    ] == {
-                        "Organisation Internationale pour les Migrations",
+                    org_map = pipelines.org._org_map
+                    iom = OrgInfo(
                         "International Organization for Migration",
+                        "international organization for migration",
                         "IOM",
-                    }
-                    assert org_mapping["United Nations Children's Fund"] == {
-                        "Fonds des Nations Unies pour l'Enfance",
+                        "iom",
+                        "447",
+                        True,
+                        True,
+                    )
+                    assert org_map[(None, "IOM")] == iom
+                    assert org_map[(None, "iom")] == iom
+                    assert (
+                        org_map[
+                            (
+                                None,
+                                "Organisation Internationale pour les Migrations",
+                            )
+                        ]
+                        == iom
+                    )
+                    assert (
+                        org_map[
+                            (
+                                None,
+                                "organisation internationale pour les migrations",
+                            )
+                        ]
+                        == iom
+                    )
+
+                    iom = OrgInfo(
+                        "International Organization for Migration",
+                        "international organization for migration",
+                        "IOM",
+                        "iom",
+                        "447",
+                        False,
+                        False,
+                    )
+                    assert (
+                        org_map[
+                            (None, "International Organisation for Migrations")
+                        ]
+                        == iom
+                    )
+                    assert (
+                        org_map[
+                            (None, "international organisation for migrations")
+                        ]
+                        == iom
+                    )
+                    assert (
+                        org_map[
+                            (None, "INTERNATIONALE ORGANISATION FOR MIGRATION")
+                        ]
+                        == iom
+                    )
+                    assert (
+                        org_map[
+                            (None, "internationale organisation for migration")
+                        ]
+                        == iom
+                    )
+                    assert (
+                        org_map[
+                            (
+                                None,
+                                "Organisation Internationale des Migrations",
+                            )
+                        ]
+                        == iom
+                    )
+                    assert (
+                        org_map[
+                            (
+                                None,
+                                "organisation internationale des migrations",
+                            )
+                        ]
+                        == iom
+                    )
+                    assert (
+                        org_map[
+                            (
+                                None,
+                                "OIM - International Organization for Migration",
+                            )
+                        ]
+                        == iom
+                    )
+                    assert (
+                        org_map[
+                            (
+                                None,
+                                "oim international organization for migration",
+                            )
+                        ]
+                        == iom
+                    )
+
+                    unicef = OrgInfo(
                         "United Nations Children's Fund",
+                        "united nations childrens fund",
                         "UNICEF",
-                    }
+                        "unicef",
+                        "447",
+                        True,
+                        True,
+                    )
+                    assert (
+                        org_map[(None, "United Nations Children's Fund")]
+                        == unicef
+                    )
+                    assert (
+                        org_map[(None, "united nations childrens fund")]
+                        == unicef
+                    )
+                    assert org_map[(None, "UNICEF")] == unicef
+                    assert org_map[(None, "unicef")] == unicef
+                    assert (
+                        org_map[
+                            (None, "Fonds des Nations Unies pour l'Enfance")
+                        ]
+                        == unicef
+                    )
+                    assert (
+                        org_map[
+                            (None, "fonds des nations unies pour lenfance")
+                        ]
+                        == unicef
+                    )
+                    assert (
+                        org_map[
+                            (
+                                None,
+                                "UNICEF - Fondo de las Naciones Unidas para la Infancia",
+                            )
+                        ]
+                        == unicef
+                    )
+                    assert (
+                        org_map[
+                            (
+                                None,
+                                "unicef fondo de las naciones unidas para la infancia",
+                            )
+                        ]
+                        == unicef
+                    )
+
+                    unicef = OrgInfo(
+                        "United Nations Children's Fund",
+                        "united nations childrens fund",
+                        "UNICEF",
+                        "unicef",
+                        "447",
+                        False,
+                        False,
+                    )
+                    assert (
+                        org_map[
+                            (None, "United Nations Children's Emergency Fund")
+                        ]
+                        == unicef
+                    )
+                    assert (
+                        org_map[
+                            (None, "united nations childrens emergency fund")
+                        ]
+                        == unicef
+                    )
+                    assert (
+                        org_map[
+                            (None, "Fond des Nations Unies pour l'Enfance")
+                        ]
+                        == unicef
+                    )
+                    assert (
+                        org_map[(None, "fond des nations unies pour lenfance")]
+                        == unicef
+                    )
+                    assert (
+                        org_map[
+                            (
+                                None,
+                                "United Nations International Childrens Emergency Fund",
+                            )
+                        ]
+                        == unicef
+                    )
+                    assert (
+                        org_map[
+                            (
+                                None,
+                                "united nations international childrens emergency fund",
+                            )
+                        ]
+                        == unicef
+                    )
+
+                    assert org_map[("AFG", "WEWORLD")] == OrgInfo(
+                        "WEWORLD",
+                        "weworld",
+                        "WEWORLD",
+                        "weworld",
+                        None,
+                        True,
+                        False,
+                    )
+
+                    assert org_map[("NGA", "HECADF")] == OrgInfo(
+                        "HECADF",
+                        "hecadf",
+                        "HECADF",
+                        "hecadf",
+                        "441",
+                        True,
+                        True,
+                    )