From 6a1a8fee146a06c3e2b79f7fdfbf306d061ba78d Mon Sep 17 00:00:00 2001 From: Jennifer Melot Date: Wed, 26 Jun 2024 15:08:50 -0400 Subject: [PATCH] Prepare data for ror --- datasets/ror_data.sql | 91 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 datasets/ror_data.sql diff --git a/datasets/ror_data.sql b/datasets/ror_data.sql new file mode 100644 index 0000000..71a7a20 --- /dev/null +++ b/datasets/ror_data.sql @@ -0,0 +1,91 @@ +-- Aggregate organizations with publications but no ror id for manual checking and then submission to ROR +CREATE OR REPLACE TABLE + staging_ai_companies_visualization.ror_data AS +WITH + clean_aliases AS ( + SELECT + cset_id, + ARRAY_TO_STRING(ARRAY_AGG(alias.alias), ";") AS name_variants + FROM + ai_companies_visualization.all_visualization_data + CROSS JOIN + UNNEST(aliases) AS alias + WHERE + LOWER(name) != LOWER(alias.alias) + GROUP BY + cset_id ), + grids AS ( + SELECT + COALESCE(legacy_cset_id, 4000+new_cset_id) AS cset_id, + ARRAY_TO_STRING(ARRAY_AGG(external_id), ";") AS grid_id + FROM + parat_input.ids + INNER JOIN + parat_input.organizations + USING + (new_cset_id) + WHERE + SOURCE = "GRID" + GROUP BY + cset_id ) +SELECT + all_visualization_data.name, + "active" AS status, + NULL AS other_language, + name_variants, + NULL AS acronym, + all_visualization_data.website, + NULL AS publications, +IF + (CONTAINS_SUBSTR(description_source, "wikipedia"), description_link, NULL) AS wiki_page, + NULL AS wiki_id, + NULL AS isni_id, + grid_id, + NULL AS crossref_id, + "Company" AS org_type, + NULL AS year_est, + NULL AS parent, + NULL AS child, + NULL AS related, + city, + standard_name AS country, + NULL AS comments +FROM + ai_companies_visualization.all_visualization_data +LEFT JOIN + clean_aliases +USING + (cset_id) +LEFT JOIN + grids +USING + (cset_id) +LEFT JOIN + parat_input.organizations +ON + cset_id = COALESCE(legacy_cset_id, 4000+new_cset_id) +LEFT JOIN ( + SELECT + DISTINCT standard_name, + raw_alpha_3 + FROM + countries.country_code) AS country_mapping +ON + all_visualization_data.country = raw_alpha_3 +WHERE + (all_pubs > 5) + AND (ARRAY_LENGTH(ror_id) = 0) + AND (all_visualization_data.name NOT IN ( + SELECT + SPLIT(name, " (")[0] + FROM + gcp_cset_ror.ror)) + AND (all_visualization_data.name NOT IN ( + SELECT + SPLIT(alias, " (")[0] + FROM + gcp_cset_ror.ror + CROSS JOIN + UNNEST(aliases) AS alias)) +ORDER BY + all_pubs desc \ No newline at end of file