Skip to content

Commit

Permalink
Merge pull request #393 from georgetown-cset/375-no-roll-ups
Browse files Browse the repository at this point in the history
Remove rollups; fix name mapping
  • Loading branch information
brianlove authored Jun 4, 2024
2 parents ca5108a + f7fe3bc commit 7dc05e9
Show file tree
Hide file tree
Showing 14 changed files with 38 additions and 66 deletions.
8 changes: 3 additions & 5 deletions company_linkage/parat_scripts/aggregate_organizations.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,9 @@
from collections import defaultdict
import subprocess

# List of companies not being aggregated
# note: check https://docs.google.com/spreadsheets/d/1Tq28O8qIA6T3AJ5oTHKCcscaNZsY_E4OPOUm6JaiwWA/edit#gid=0
# to ensure list is complete
# we might switch this to a query or something instead of hard-coding it but this is easier for now
no_roll_up = [550, 1826, 313, 327, 2343]
# List of company ids that should not be rolled up (aggregated). The list is intentionally empty for now, but the
# roll-up exclusion logic is kept in place in case we decide to exclude specific companies again later.
no_roll_up = []


class Organization:
Expand Down
3 changes: 2 additions & 1 deletion company_linkage/sequences/initial_data.csv
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
high_resolution_entities,organizations
high_resolution_entities,organizations
high_resolution_entities,original_company_names
29 changes: 1 addition & 28 deletions company_linkage/sql/original_company_names.sql
Original file line number Diff line number Diff line change
@@ -1,28 +1 @@
with names as (
select
best_name as name
from
cset_entity_resolution_internal.bloomberg_vendors_resolution
union all
select
canon_employer as name
from
cset_entity_resolution_internal.burning_glass_jobs_resolution
union all
select
name
from
cset_entity_resolution_internal.high_resolution_companies_preannotation
union all
select
name
from
cset_entity_resolution_internal.initial_bgov_companies_preannotation
union all
select
name
from
cset_entity_resolution_internal.sandp500_preannotation
)

select distinct(name) as name from names
select distinct(name) from high_resolution_entities.organizations
8 changes: 4 additions & 4 deletions web/gui-v2/src/components/ListView.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,16 @@ describe("ListView", () => {
);

// Filter by China and verify that the count updates
expect(screen.getByText('Viewing 635 companies')).toBeVisible();
expect(screen.getByText('Viewing 632 companies')).toBeVisible();
const regionHeader = screen.getByRole('columnheader', { name: /country/i });
await user.click(getByRole(regionHeader, 'button', { name: /open/i }));
const menu = screen.getByRole('listbox');
await user.click(getByText(menu, 'China'));
expect(screen.getByText('Viewing 43 of 635 companies')).toBeVisible();
expect(screen.getByText('Viewing 43 of 632 companies')).toBeVisible();

// Reset the filters and verify that the count updates
await user.click(screen.getByRole('button', { name: /reset filters/i }));
expect(screen.getByText('Viewing 635 companies')).toBeVisible();
expect(screen.getByText('Viewing 632 companies')).toBeVisible();
}, 20000);


Expand All @@ -49,7 +49,7 @@ describe("ListView", () => {
await user.click(getByRole(companyHeader, 'combobox'));
const menu = screen.getByRole('listbox');
await user.click(getByText(menu, 'S&P 500'));
expect(screen.getByText('Viewing 445 of 635 companies')).toBeVisible();
expect(screen.getByText('Viewing 442 of 632 companies')).toBeVisible();
}, 20000);


Expand Down
2 changes: 1 addition & 1 deletion web/gui-v2/src/data/companies.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion web/gui-v2/src/static_data/data.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion web/gui-v2/src/static_data/overall_data.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion web/raw_data/sectors.jsonl

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions web/scripts/retrieve_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,7 @@ def get_continent(country: str) -> str:
def clean_company_name(name: str, lowercase_to_orig_cname: dict) -> str:
"""
Clean the company name. First try to find it in the map containing one-off mappings, then try to find it
in the mapping of lowercase to original company names, and if both those fail, title case it
in the mapping of lowercase to original company names, and if both those fail, return the original name
:param name: company name in any casing; it is stripped, and lowercased only for the lowercase_to_orig_cname lookup
:param lowercase_to_orig_cname: dict mapping lowercase to original-cased company names
:return: cleaned company name
Expand All @@ -448,9 +448,9 @@ def clean_company_name(name: str, lowercase_to_orig_cname: dict) -> str:
clean_name = name.strip()
if clean_name in COMPANY_NAME_MAP:
return COMPANY_NAME_MAP[clean_name]
if clean_name in lowercase_to_orig_cname:
return lowercase_to_orig_cname[clean_name]
return clean_name.title()
if clean_name.lower() in lowercase_to_orig_cname:
return lowercase_to_orig_cname[clean_name.lower()]
return clean_name


def clean_aliases(aliases: list, lowercase_to_orig_cname: dict, orig_name: str = None) -> str:
Expand Down
24 changes: 12 additions & 12 deletions web/tests/test_data/alphabet_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,51 +4,51 @@
"aliases": [
{
"alias_language": "en",
"alias": "alphabet inc"
"alias": "Alphabet Inc"
}
],
"parent": [],
"children": [
{
"child_name": "intrinsic",
"child_name": "Intrinsic",
"child_id": 3163
},
{
"child_name": "waymo",
"child_name": "Waymo",
"child_id": 762
},
{
"child_name": "vicarious ai",
"child_name": "Vicarious Ai",
"child_id": 750
},
{
"child_name": "verily life sciences",
"child_name": "Verily Life Sciences",
"child_id": 745
}
],
"non_agg_children": [
{
"child_name": "google brain",
"child_name": "Google Brain",
"child_id": 473
},
{
"child_name": "deepmind",
"child_name": "Deepmind",
"child_id": 414
},
{
"child_name": "google",
"child_name": "Google",
"child_id": 101
},
{
"child_name": "fitbit",
"child_name": "Fitbit",
"child_id": 451
},
{
"child_name": "nest",
"child_name": "Nest",
"child_id": 594
},
{
"child_name": "google robotics",
"child_name": "Google Robotics",
"child_id": 474
}
],
Expand Down Expand Up @@ -5805,6 +5805,6 @@
"Search_Methods_pats": 2
}
],
"name": "alphabet",
"name": "Alphabet",
"patent_name": "alphabet"
}
4 changes: 2 additions & 2 deletions web/tests/test_data/alphabet_output.json
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@
"https://www.linkedin.com/company/verily"
],
"stage": "Mature",
"name": "Alphabet",
"name": "Alphabet (including Google)",
"patent_name": "alphabet",
"continent": "North America",
"local_logo": "alphabet.png",
"aliases": "Alphabet Inc",
"aliases": "Alphabet; Alphabet Inc",
"permid": "5030853586",
"permid_links": [
{
Expand Down
6 changes: 3 additions & 3 deletions web/tests/test_data/hugging_face_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
"aliases": [
{
"alias_language": "en",
"alias": "hugging face, inc."
"alias": "Hugging Face, Inc."
},
{
"alias_language": "en",
"alias": "hugging face inc"
"alias": "Hugging Face Inc"
}
],
"parent": [],
Expand Down Expand Up @@ -854,6 +854,6 @@
"Search_Methods_pats": null
}
],
"name": "hugging face",
"name": "Hugging Face",
"patent_name": "hugging face"
}
2 changes: 1 addition & 1 deletion web/tests/test_data/hugging_face_output.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"patent_name": "hugging face",
"continent": "North America",
"local_logo": "hugging_face.png",
"aliases": "Hugging Face Inc; Hugging Face, Inc",
"aliases": "Hugging Face; Hugging Face Inc; Hugging Face, Inc",
"permid": "5063742076",
"permid_links": [
{
Expand Down
4 changes: 2 additions & 2 deletions web/tests/test_retrieve_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,15 +123,15 @@ def test_clean_company_name(self):
self.assertEqual(clean_company_name("captricity", {}), "Vidado")
self.assertEqual(clean_company_name("创新奇智", {}), "AInnovation")
self.assertEqual(clean_company_name("ibm", {"ibm": "IBM"}), "IBM")
self.assertEqual(clean_company_name("test", {}), "Test")
self.assertEqual(clean_company_name("test", {}), "test")

def test_clean_aliases(self):
aliases = [{"alias": "foo"}, {"alias": "bar"}, {"alias": "baz"}]
lowercase_to_orig_cname = {
"foo": "FoO",
"bar": "BAR",
}
self.assertEqual("BAR; Baz; FoO; Fred", clean_aliases(aliases, lowercase_to_orig_cname, "Fred"))
self.assertEqual("BAR; FoO; Fred; baz", clean_aliases(aliases, lowercase_to_orig_cname, "Fred"))

def test_get_permid_links(self):
permid = "1"
Expand Down

0 comments on commit 7dc05e9

Please sign in to comment.