Merge pull request #393 from georgetown-cset/375-no-roll-ups

Remove rollups; fix name mapping
georgetown-cset · Jun 4, 2024 · 7dc05e9 · 7dc05e9
2 parents ca5108a + f7fe3bc
commit 7dc05e9
Show file tree

Hide file tree

Showing 14 changed files with 38 additions and 66 deletions.
diff --git a/company_linkage/parat_scripts/aggregate_organizations.py b/company_linkage/parat_scripts/aggregate_organizations.py
@@ -4,11 +4,9 @@
 from collections import defaultdict
 import subprocess
 
-# List of companies not being aggregated
-# note: check https://docs.google.com/spreadsheets/d/1Tq28O8qIA6T3AJ5oTHKCcscaNZsY_E4OPOUm6JaiwWA/edit#gid=0
-# to ensure list is complete
-# we might switch this to a query or something instead of hard-coding it but this is easier for now
-no_roll_up = [550, 1826, 313, 327, 2343]
+# List of company ids that should not be rolled up. We have decided not to exclude any companies for now, but the
+# logic that consults this list is left in place in case that decision changes later.
+no_roll_up = []
 
 
 class Organization:

diff --git a/company_linkage/sequences/initial_data.csv b/company_linkage/sequences/initial_data.csv
@@ -1 +1,2 @@
-high_resolution_entities,organizations
+high_resolution_entities,organizations
+high_resolution_entities,original_company_names
diff --git a/company_linkage/sql/original_company_names.sql b/company_linkage/sql/original_company_names.sql
@@ -1,28 +1 @@
-with names as (
-select
-  best_name as name
-from
-  cset_entity_resolution_internal.bloomberg_vendors_resolution
-union all
-select
-  canon_employer as name
-from
-  cset_entity_resolution_internal.burning_glass_jobs_resolution
-union all
-select
-  name
-from
-  cset_entity_resolution_internal.high_resolution_companies_preannotation
-union all
-select
-  name
-from
-  cset_entity_resolution_internal.initial_bgov_companies_preannotation
-union all
-select
-  name
-from
-  cset_entity_resolution_internal.sandp500_preannotation
-)
-
-select distinct(name) as name from names
+select distinct(name) from high_resolution_entities.organizations
diff --git a/web/gui-v2/src/components/ListView.test.js b/web/gui-v2/src/components/ListView.test.js
@@ -25,16 +25,16 @@ describe("ListView", () => {
     );
 
     // Filter by China and verify that the count updates
-    expect(screen.getByText('Viewing 635 companies')).toBeVisible();
+    expect(screen.getByText('Viewing 632 companies')).toBeVisible();
     const regionHeader = screen.getByRole('columnheader', { name: /country/i });
     await user.click(getByRole(regionHeader, 'button', { name: /open/i }));
     const menu = screen.getByRole('listbox');
     await user.click(getByText(menu, 'China'));
-    expect(screen.getByText('Viewing 43 of 635 companies')).toBeVisible();
+    expect(screen.getByText('Viewing 43 of 632 companies')).toBeVisible();
 
     // Reset the filters and verify that the count updates
     await user.click(screen.getByRole('button', { name: /reset filters/i }));
-    expect(screen.getByText('Viewing 635 companies')).toBeVisible();
+    expect(screen.getByText('Viewing 632 companies')).toBeVisible();
   }, 20000);
 
 
@@ -49,7 +49,7 @@ describe("ListView", () => {
     await user.click(getByRole(companyHeader, 'combobox'));
     const menu = screen.getByRole('listbox');
     await user.click(getByText(menu, 'S&P 500'));
-    expect(screen.getByText('Viewing 445 of 635 companies')).toBeVisible();
+    expect(screen.getByText('Viewing 442 of 632 companies')).toBeVisible();
   }, 20000);
 
 

diff --git a/web/gui-v2/src/data/companies.json b/web/gui-v2/src/data/companies.json
diff --git a/web/gui-v2/src/static_data/data.js b/web/gui-v2/src/static_data/data.js
diff --git a/web/gui-v2/src/static_data/overall_data.json b/web/gui-v2/src/static_data/overall_data.json
diff --git a/web/raw_data/sectors.jsonl b/web/raw_data/sectors.jsonl
diff --git a/web/scripts/retrieve_data.py b/web/scripts/retrieve_data.py
@@ -438,7 +438,7 @@ def get_continent(country: str) -> str:
 def clean_company_name(name: str, lowercase_to_orig_cname: dict) -> str:
     """
     Clean the company name. First try to find it in the map containing one-off mappings, then try to find it
-    in the mapping of lowercase to original company names, and if both those fail, title case it
+    in the mapping of lowercase to original company names, and if both of those fail, return the stripped name as-is
     :param name: lowercased company name
     :param lowercase_to_orig_cname: dict mapping lowercase to original-cased company names
     :return: cleaned company name
@@ -448,9 +448,9 @@ def clean_company_name(name: str, lowercase_to_orig_cname: dict) -> str:
     clean_name = name.strip()
     if clean_name in COMPANY_NAME_MAP:
         return COMPANY_NAME_MAP[clean_name]
-    if clean_name in lowercase_to_orig_cname:
-        return lowercase_to_orig_cname[clean_name]
-    return clean_name.title()
+    if clean_name.lower() in lowercase_to_orig_cname:
+        return lowercase_to_orig_cname[clean_name.lower()]
+    return clean_name
 
 
 def clean_aliases(aliases: list, lowercase_to_orig_cname: dict, orig_name: str = None) -> str:

diff --git a/web/tests/test_data/alphabet_input.json b/web/tests/test_data/alphabet_input.json
@@ -4,51 +4,51 @@
   "aliases": [
     {
       "alias_language": "en",
-      "alias": "alphabet inc"
+      "alias": "Alphabet Inc"
     }
   ],
   "parent": [],
   "children": [
     {
-      "child_name": "intrinsic",
+      "child_name": "Intrinsic",
       "child_id": 3163
     },
     {
-      "child_name": "waymo",
+      "child_name": "Waymo",
       "child_id": 762
     },
     {
-      "child_name": "vicarious ai",
+      "child_name": "Vicarious Ai",
       "child_id": 750
     },
     {
-      "child_name": "verily life sciences",
+      "child_name": "Verily Life Sciences",
       "child_id": 745
     }
   ],
   "non_agg_children": [
     {
-      "child_name": "google brain",
+      "child_name": "Google Brain",
       "child_id": 473
     },
     {
-      "child_name": "deepmind",
+      "child_name": "Deepmind",
       "child_id": 414
     },
     {
-      "child_name": "google",
+      "child_name": "Google",
       "child_id": 101
     },
     {
-      "child_name": "fitbit",
+      "child_name": "Fitbit",
       "child_id": 451
     },
     {
-      "child_name": "nest",
+      "child_name": "Nest",
       "child_id": 594
     },
     {
-      "child_name": "google robotics",
+      "child_name": "Google Robotics",
       "child_id": 474
     }
   ],
@@ -5805,6 +5805,6 @@
       "Search_Methods_pats": 2
     }
   ],
-  "name": "alphabet",
+  "name": "Alphabet",
   "patent_name": "alphabet"
 }
diff --git a/web/tests/test_data/alphabet_output.json b/web/tests/test_data/alphabet_output.json
@@ -36,11 +36,11 @@
     "https://www.linkedin.com/company/verily"
   ],
   "stage": "Mature",
-  "name": "Alphabet",
+  "name": "Alphabet (including Google)",
   "patent_name": "alphabet",
   "continent": "North America",
   "local_logo": "alphabet.png",
-  "aliases": "Alphabet Inc",
+  "aliases": "Alphabet; Alphabet Inc",
   "permid": "5030853586",
   "permid_links": [
     {

diff --git a/web/tests/test_data/hugging_face_input.json b/web/tests/test_data/hugging_face_input.json
@@ -4,11 +4,11 @@
   "aliases": [
     {
       "alias_language": "en",
-      "alias": "hugging face, inc."
+      "alias": "Hugging Face, Inc."
     },
     {
       "alias_language": "en",
-      "alias": "hugging face inc"
+      "alias": "Hugging Face Inc"
     }
   ],
   "parent": [],
@@ -854,6 +854,6 @@
       "Search_Methods_pats": null
     }
   ],
-  "name": "hugging face",
+  "name": "Hugging Face",
   "patent_name": "hugging face"
 }
diff --git a/web/tests/test_data/hugging_face_output.json b/web/tests/test_data/hugging_face_output.json
@@ -19,7 +19,7 @@
   "patent_name": "hugging face",
   "continent": "North America",
   "local_logo": "hugging_face.png",
-  "aliases": "Hugging Face Inc; Hugging Face, Inc",
+  "aliases": "Hugging Face; Hugging Face Inc; Hugging Face, Inc",
   "permid": "5063742076",
   "permid_links": [
     {

diff --git a/web/tests/test_retrieve_data.py b/web/tests/test_retrieve_data.py
@@ -123,15 +123,15 @@ def test_clean_company_name(self):
         self.assertEqual(clean_company_name("captricity", {}), "Vidado")
         self.assertEqual(clean_company_name("创新奇智", {}), "AInnovation")
         self.assertEqual(clean_company_name("ibm", {"ibm": "IBM"}), "IBM")
-        self.assertEqual(clean_company_name("test", {}), "Test")
+        self.assertEqual(clean_company_name("test", {}), "test")
 
     def test_clean_aliases(self):
         aliases = [{"alias": "foo"}, {"alias": "bar"}, {"alias": "baz"}]
         lowercase_to_orig_cname = {
             "foo": "FoO",
             "bar": "BAR",
         }
-        self.assertEqual("BAR; Baz; FoO; Fred", clean_aliases(aliases, lowercase_to_orig_cname, "Fred"))
+        self.assertEqual("BAR; FoO; Fred; baz", clean_aliases(aliases, lowercase_to_orig_cname, "Fred"))
 
     def test_get_permid_links(self):
         permid = "1"