From cb5fc372464aaeedb9f91145232e5085fb34a0c5 Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Mon, 30 Sep 2024 12:49:09 -0400 Subject: [PATCH 01/23] update study_participants --- config/es_indices_ccdi_model.yml | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index 5823613..ff190be 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -13,8 +13,17 @@ Indices: type: keyword sex_at_birth: type: keyword - last_known_survival_status: - type: keyword + survival_filters: + type: nested + properties: + last_known_survival_status: + type: keyword + age_at_event_free_survival_status: + type: integer + event_free_survival_status: + type: keyword + first_event: + type: keyword sample_diagnosis_file_filters: type: nested properties: @@ -326,15 +335,24 @@ Indices: OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - WITH p, sample_diagnosis_file_filter, COLLECT(DISTINCT su.last_known_survival_status) as vital_status, st, stf, stp + WITH p, sample_diagnosis_file_filter, + COLLECT({COLLECT( + DISTINCT + CASE + WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' + ELSE su.last_known_survival_status + END + ) AS last_known_survival_status, + COLLECT(DISTINCT su.event_free_survival_status ) as event_free_survival_status, + COLLECT(DISTINCT su.first_event ) as first_event) } + as survival_filters, st, stf, stp RETURN DISTINCT p.id as id, p.id as pid, p.participant_id as participant_id, apoc.text.split(p.race, ';') as race, p.sex_at_birth as sex_at_birth, - case when 'Dead' in vital_status then ['Dead'] - else vital_status end as last_known_survival_status, + survival_filters as survival_filters, sample_diagnosis_file_filter AS sample_diagnosis_file_filters, 
st.study_id as study_id, st.dbgap_accession as dbgap_accession, From 3c5912a615d4488dac0a177383690791b17d6940 Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Mon, 30 Sep 2024 13:00:38 -0400 Subject: [PATCH 02/23] update study_participants --- config/es_indices_ccdi_model.yml | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index ff190be..9d6f784 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -335,17 +335,20 @@ Indices: OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - WITH p, sample_diagnosis_file_filter, - COLLECT({COLLECT( - DISTINCT - CASE - WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' - ELSE su.last_known_survival_status - END - ) AS last_known_survival_status, - COLLECT(DISTINCT su.event_free_survival_status ) as event_free_survival_status, - COLLECT(DISTINCT su.first_event ) as first_event) } - as survival_filters, st, stf, stp +WITH p, sample_diagnosis_file_filter, + COLLECT(DISTINCT CASE + WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' + ELSE su.last_known_survival_status + END) AS last_known_survival_status, + COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, + COLLECT(DISTINCT su.first_event) as first_event, + COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, st, stf, stp +WITH p, sample_diagnosis_file_filter, + COLLECT({last_known_survival_status: last_known_survival_status, + event_free_survival_status: event_free_survival_status, + first_event: first_event, + age_at_event_free_survival_status: age_at_event_free_survival_status} )AS survival_filters, + st, stf, stp RETURN DISTINCT p.id as id, p.id as pid, From 
1f2c0f376725ba9d82ad7da4682f183628fc768b Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Mon, 30 Sep 2024 13:29:17 -0400 Subject: [PATCH 03/23] update study_participants two other queries --- config/es_indices_ccdi_model.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index 9d6f784..8599ae7 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -373,7 +373,6 @@ WITH p, sample_diagnosis_file_filter, null as participant_id, null as race, null as sex_at_birth, - null as last_known_survival_status, COLLECT(DISTINCT { sample_anatomic_site: null, participant_age_at_collection: null, @@ -394,6 +393,12 @@ WITH p, sample_diagnosis_file_filter, library_source_molecule: null, library_strategy: null }) AS sample_diagnosis_file_filters, + COLLECT(DISTINCT { + last_known_survival_status: null, + age_at_event_free_survival_status: null, + event_free_survival_status: null, + first_event: null, + }) AS survival_filters, st.study_id as study_id, st.dbgap_accession as dbgap_accession, st.study_acronym as study_acronym, @@ -414,7 +419,12 @@ WITH p, sample_diagnosis_file_filter, null as participant_id, null as race, null as sex_at_birth, - null as last_known_survival_status, + COLLECT(DISTINCT { + null as last_known_survival_status, + null as age_at_event_free_survival_status, + null as event_free_survival_status, + null as first_event, + }) AS survival_filters, COLLECT(DISTINCT { sample_anatomic_site: apoc.text.split(sm.anatomic_site, ';'), participant_age_at_collection: sm.participant_age_at_collection, From e9bbfac9f7326955bc8af30be0d92d40b320764e Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Mon, 30 Sep 2024 13:53:40 -0400 Subject: [PATCH 04/23] update study_participants two other queries --- config/es_indices_ccdi_model.yml | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index 8599ae7..be7a950 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -344,10 +344,10 @@ WITH p, sample_diagnosis_file_filter, COLLECT(DISTINCT su.first_event) as first_event, COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, st, stf, stp WITH p, sample_diagnosis_file_filter, - COLLECT({last_known_survival_status: last_known_survival_status, + {last_known_survival_status: last_known_survival_status, event_free_survival_status: event_free_survival_status, first_event: first_event, - age_at_event_free_survival_status: age_at_event_free_survival_status} )AS survival_filters, + age_at_event_free_survival_status: age_at_event_free_survival_status} AS survival_filters, st, stf, stp RETURN DISTINCT p.id as id, From bc479df8d36c6623b810969decc92a1a4b73b9d7 Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Tue, 1 Oct 2024 10:42:42 -0400 Subject: [PATCH 05/23] update study_participants two other queries --- config/es_indices_ccdi_model.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index be7a950..9f03098 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -335,20 +335,18 @@ Indices: OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) -WITH p, sample_diagnosis_file_filter, - COLLECT(DISTINCT CASE - WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' - ELSE su.last_known_survival_status - END) AS last_known_survival_status, - COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, - COLLECT(DISTINCT su.first_event) as 
first_event, - COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, st, stf, stp -WITH p, sample_diagnosis_file_filter, - {last_known_survival_status: last_known_survival_status, - event_free_survival_status: event_free_survival_status, - first_event: first_event, - age_at_event_free_survival_status: age_at_event_free_survival_status} AS survival_filters, - st, stf, stp + WITH p, sample_diagnosis_file_filter, st, stf, stp, + COLLECT(DISTINCT CASE + WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' + ELSE su.last_known_survival_status + END) AS last_known_survival_status, + COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, + COLLECT(DISTINCT su.first_event) as first_event, + COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status + WITH p, sample_diagnosis_file_filter, st, stf, stp, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, + event_free_survival_status: event_free_survival_status, + first_event: first_event, + age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters RETURN DISTINCT p.id as id, p.id as pid, From 538a10a710a58b3d3b2d59d23152bed889bdde00 Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Tue, 1 Oct 2024 10:54:44 -0400 Subject: [PATCH 06/23] update participants query --- config/es_indices_ccdi_model.yml | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index 9f03098..5faa995 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -480,8 +480,17 @@ Indices: type: keyword alternate_participant_id: type: keyword - last_known_survival_status: - type: keyword + survival_filters: + type: nested + properties: + last_known_survival_status: + type: keyword + age_at_event_free_survival_status: + 
type: integer + event_free_survival_status: + type: keyword + first_event: + type: keyword sample_diagnosis_file_filters: type: nested properties: @@ -802,7 +811,17 @@ Indices: OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - WITH p, sy, sample_diagnosis_file_filter, COLLECT(DISTINCT su.last_known_survival_status) as vital_status, file, st, stf, stp + WITH p, sy, sample_diagnosis_file_filter, COLLECT(DISTINCT CASE + WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' + ELSE su.last_known_survival_status + END) AS last_known_survival_status, + COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, + COLLECT(DISTINCT su.first_event) as first_event, + COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, file, st, stf, stp + WITH p, sy, sample_diagnosis_file_filter, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, + event_free_survival_status: event_free_survival_status, + first_event: first_event, + age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, file, st, stf, stp RETURN DISTINCT p.id as id, p.participant_id as participant_id, @@ -810,8 +829,7 @@ Indices: p.race as race_str, p.sex_at_birth as sex_at_birth, apoc.text.join(Collect(distinct sy.synonym_id), ',') as alternate_participant_id, - case when 'Dead' in vital_status then ['Dead'] - else vital_status end as last_known_survival_status, + survival_filters as survival_filters, sample_diagnosis_file_filter AS sample_diagnosis_file_filters, st.study_id as study_id, st.dbgap_accession as dbgap_accession, From be35c030baa28be966b3d7484c9285f94ca47579 Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Tue, 1 Oct 2024 14:25:28 -0400 Subject: [PATCH 07/23] update diagnosis query --- 
config/es_indices_ccdi_model.yml | 65 ++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 12 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index 5faa995..fe1369c 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -883,8 +883,17 @@ Indices: type: keyword study_name: type: keyword - last_known_survival_status: - type: keyword + survival_filters: + type: nested + properties: + last_known_survival_status: + type: keyword + age_at_event_free_survival_status: + type: integer + event_free_survival_status: + type: keyword + first_event: + type: keyword sample_file_filters: type: nested properties: @@ -1051,7 +1060,17 @@ Indices: OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - WITH p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters,participant_radiology_file_filters, file, COLLECT(DISTINCT su.last_known_survival_status) as vital_status, st, stf, stp, dg + WITH p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters,participant_radiology_file_filters, file, COLLECT(DISTINCT CASE + WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' + ELSE su.last_known_survival_status + END) AS last_known_survival_status, + COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, + COLLECT(DISTINCT su.first_event) as first_event, + COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, st, stf, stp, dg + WITH p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters,participant_radiology_file_filters, file, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, + event_free_survival_status: event_free_survival_status, + first_event: first_event, + 
age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, st, stf, stp, dg RETURN DISTINCT dg.id as id, p.id as pid, @@ -1074,8 +1093,7 @@ Indices: st.dbgap_accession as dbgap_accession, st.study_acronym as study_acronym, st.study_name as study_name, - case when 'Dead' in vital_status then ['Dead'] - else vital_status end as last_known_survival_status, + survival_filters as survival_filters, apoc.coll.union(cell_line_pdx_file_filters, general_file_filters) + participant_clinical_measure_file_filters + participant_radiology_file_filters AS sample_file_filters, COUNT(DISTINCT file.id) as file_count, COLLECT(DISTINCT file.id) as files @@ -1147,7 +1165,17 @@ Indices: OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - WITH dg, p, sm, sample_file_filter, file, COLLECT(DISTINCT su.last_known_survival_status) as vital_status, st, stf, stp + WITH dg, p, sm, sample_file_filter, file, COLLECT(DISTINCT CASE + WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' + ELSE su.last_known_survival_status + END) AS last_known_survival_status, + COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, + COLLECT(DISTINCT su.first_event) as first_event, + COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, st, stf, stp + WITH dg, p, sm, sample_file_filter, file, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, + event_free_survival_status: event_free_survival_status, + first_event: first_event, + age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, st, stf, stp RETURN DISTINCT dg.id as id, p.id as pid, @@ -1170,8 +1198,7 @@ Indices: st.dbgap_accession as dbgap_accession, st.study_acronym as study_acronym, st.study_name as study_name, - case when 'Dead' in vital_status then ['Dead'] - else vital_status 
end as last_known_survival_status, + survival_filters as survival_filters, sample_file_filter AS sample_file_filters, COUNT(DISTINCT file.id) as file_count, COLLECT(DISTINCT file.id) as files @@ -1223,7 +1250,17 @@ Indices: OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - WITH dg, p, sid, sample_id, sample_file_filter, files, COLLECT(DISTINCT su.last_known_survival_status) as vital_status, st, stf, stp + WITH dg, p, sid, sample_id, sample_file_filter, files, COLLECT(DISTINCT CASE + WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' + ELSE su.last_known_survival_status + END) AS last_known_survival_status, + COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, + COLLECT(DISTINCT su.first_event) as first_event, + COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, st, stf, stp + WITH dg, p, sid, sample_id, sample_file_filter, files, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, + event_free_survival_status: event_free_survival_status, + first_event: first_event, + age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, st, stf, stp RETURN DISTINCT dg.id as id, p.id as pid, @@ -1246,8 +1283,7 @@ Indices: st.dbgap_accession as dbgap_accession, st.study_acronym as study_acronym, st.study_name as study_name, - case when 'Dead' in vital_status then ['Dead'] - else vital_status end as last_known_survival_status, + survival_filters as survival_filters, sample_file_filter AS sample_file_filters, size(files) as file_count, files as files @@ -1310,7 +1346,12 @@ Indices: st.dbgap_accession as dbgap_accession, st.study_acronym as study_acronym, st.study_name as study_name, - null as last_known_survival_status, + COLLECT(DISTINCT { + last_known_survival_status: null, + age_at_event_free_survival_status: null, + 
event_free_survival_status: null, + first_event: null, + }) AS survival_filters, sample_file_filter AS sample_file_filters, COUNT(DISTINCT file.id) as file_count, COLLECT(DISTINCT file.id) as files From 17707b8ed7b08daf643e29c7a191719bd098117c Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Tue, 8 Oct 2024 10:34:57 -0400 Subject: [PATCH 08/23] create survivals --- bento | 2 +- config/es_indices_ccdi_model.yml | 420 ++++++++++++++++++++++++++++++- 2 files changed, 411 insertions(+), 11 deletions(-) diff --git a/bento b/bento index d644aac..1fda519 160000 --- a/bento +++ b/bento @@ -1 +1 @@ -Subproject commit d644aac1198ad56b9dc2a7e95f8173f6eae271e6 +Subproject commit 1fda5197855eabb4884d89231a34550d36bb606d diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index fe1369c..5551d89 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -838,6 +838,381 @@ Indices: COUNT(DISTINCT file.id) as file_count, COLLECT(DISTINCT file.id) as files + - index_name: survivals + type: neo4j + mapping: + id: + type: keyword + participant_id: + type: keyword + normalizer: lowercase + race: + type: keyword + race_str: + type: keyword + sex_at_birth: + type: keyword + alternate_participant_id: + type: keyword + survival_filters: + type: nested + properties: + last_known_survival_status: + type: keyword + age_at_event_free_survival_status: + type: integer + event_free_survival_status: + type: keyword + first_event: + type: keyword + sample_diagnosis_file_filters: + type: nested + properties: + sample_anatomic_site: + type: keyword + participant_age_at_collection: + type: integer + sample_tumor_status: + type: keyword + tumor_classification: + type: keyword + age_at_diagnosis: + type: integer + diagnosis_anatomic_site: + type: keyword + disease_phase: + type: keyword + diagnosis_classification_system: + type: keyword + diagnosis_basis: + type: keyword + tumor_grade_source: + 
type: keyword + tumor_stage_source: + type: keyword + diagnosis: + type: keyword + assay_method: + type: keyword + file_type: + type: keyword + library_selection: + type: keyword + library_source_material: + type: keyword + library_source_molecule: + type: keyword + library_strategy: + type: keyword + study_id: + type: keyword + dbgap_accession: + type: keyword + study_acronym: + type: keyword + study_name: + type: keyword + file_count: + type: integer + files: + type: text + fields: + keyword: + type: keyword + # Cypher query will be used to retrieve data from Neo4j, and index into Elasticsearch + cypher_queries: + - query: | + MATCH (p:participant) + optional match (p)<--(sm:sample) + optional match (p)<--(file) + where (file: clinical_measure_file or file: radiology_file) + with distinct p, sm, file + with p, collect(DISTINCT { + sample_anatomic_site: apoc.text.split(sm.anatomic_site, ';'), + participant_age_at_collection: sm.participant_age_at_collection, + sample_tumor_status: sm.sample_tumor_status, + tumor_classification: sm.tumor_classification, + assay_method: CASE labels(file)[0] WHEN 'clinical_measure_file' THEN 'Clinical data' + WHEN 'radiology_file' THEN 'Radiology imaging' + ELSE null END, + file_type: file.file_type, + library_source_material: null, + library_source_molecule: null, + library_strategy: null + }) as sample_clinical_radiology_file_filter + optional match (p)<--(sm:sample)<--(file) + where (file: sequencing_file or file: methylation_array_file or file: pathology_file or file: cytogenomic_file) + with p, sample_clinical_radiology_file_filter, collect(DISTINCT { + sample_anatomic_site: apoc.text.split(sm.anatomic_site, ';'), + participant_age_at_collection: sm.participant_age_at_collection, + sample_tumor_status: sm.sample_tumor_status, + tumor_classification: sm.tumor_classification, + assay_method: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN 'Sequencing' + WHEN 'cytogenomic_file' THEN 'Cytogenomic' + WHEN 'pathology_file' THEN 
'Pathology imaging' + WHEN 'methylation_array_file' THEN 'Methylation array' + ELSE null END, + file_type: file.file_type, + library_selection: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_selection + ELSE null END, + library_source_material: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_material + ELSE null END, + library_source_molecule: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_molecule + ELSE null END, + library_strategy: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_strategy + ELSE null END + }) as sample_sequencing_cytogenomic_pathology_methylation_file_filter + with p, apoc.coll.union(sample_clinical_radiology_file_filter, sample_sequencing_cytogenomic_pathology_methylation_file_filter) as sample_file_filters + optional match (p)<--(dg:diagnosis) + with p, sample_file_filters, dg + unwind sample_file_filters as sample_file_filter + with p, collect(apoc.map.merge(sample_file_filter, { + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis + })) as sample_diagnosis_file_filter + optional match (p)<--(sm:sample)<--(dg:diagnosis) + optional match (sm)<--(file) + where (file: sequencing_file or file: methylation_array_file or file: pathology_file or file: cytogenomic_file) + with p, sample_diagnosis_file_filter, COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm.anatomic_site, ';'), + participant_age_at_collection: sm.participant_age_at_collection, + sample_tumor_status: sm.sample_tumor_status, + tumor_classification: sm.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + 
disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN 'Sequencing' + WHEN 'cytogenomic_file' THEN 'Cytogenomic' + WHEN 'pathology_file' THEN 'Pathology imaging' + WHEN 'methylation_array_file' THEN 'Methylation array' + ELSE null END, + file_type: file.file_type, + library_selection: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_selection + ELSE null END, + library_source_material: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_material + ELSE null END, + library_source_molecule: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_molecule + ELSE null END, + library_strategy: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_strategy + ELSE null END + }) AS sample_diagnosis_filters_1 + with p, apoc.coll.union(sample_diagnosis_file_filter, sample_diagnosis_filters_1) as sample_diagnosis_file_filters + optional match (p)<--(sm:sample)<--(dg:diagnosis) + optional match (p)<--(file) + where (file: clinical_measure_file or file: radiology_file) + with p, sample_diagnosis_file_filters, COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm.anatomic_site, ';'), + participant_age_at_collection: sm.participant_age_at_collection, + sample_tumor_status: sm.sample_tumor_status, + tumor_classification: sm.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE labels(file)[0] 
WHEN 'clinical_measure_file' THEN 'Clinical data' + WHEN 'radiology_file' THEN 'Radiology imaging' + ELSE null END, + file_type: file.file_type, + library_selection: null, + library_source_material: null, + library_source_molecule: null, + library_strategy: null + }) AS sample_diagnosis_filters_2 + with p, apoc.coll.union(sample_diagnosis_file_filters, sample_diagnosis_filters_2) as sample_diagnosis_file_filter + optional MATCH (p)<-[:of_sample]-(sm1:sample)<--(cl)<--(sm2:sample) + WHERE (cl: cell_line or cl: pdx) + optional Match (sm2)<--(file) + WHERE (file: sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:cytogenomic_file) + optional Match (sm1)<--(dg:diagnosis) + with p, sample_diagnosis_file_filter, COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm1.anatomic_site, ';'), + participant_age_at_collection: sm1.participant_age_at_collection, + sample_tumor_status: sm1.sample_tumor_status, + tumor_classification: sm1.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN 'Sequencing' + WHEN 'cytogenomic_file' THEN 'Cytogenomic' + WHEN 'pathology_file' THEN 'Pathology imaging' + WHEN 'methylation_array_file' THEN 'Methylation array' + ELSE null END, + file_type: file.file_type, + library_selection: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_selection + ELSE null END, + library_source_material: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_material + ELSE null END, + library_source_molecule: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_molecule + ELSE null END, 
+ library_strategy: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_strategy + ELSE null END + }) AS sample_diagnosis_filters_1 + with p, apoc.coll.union(sample_diagnosis_file_filter, sample_diagnosis_filters_1) as sample_diagnosis_file_filters + optional MATCH (p)<-[:of_sample]-(sm1:sample)<--(cl)<--(sm2:sample) + WHERE (cl: cell_line or cl: pdx) + optional Match (sm2)<--(file) + WHERE (file: sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:cytogenomic_file) + optional Match (sm2)<--(dg:diagnosis) + with p, sample_diagnosis_file_filters, COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm2.anatomic_site, ';'), + participant_age_at_collection: sm2.participant_age_at_collection, + sample_tumor_status: sm2.sample_tumor_status, + tumor_classification: sm2.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN 'Sequencing' + WHEN 'cytogenomic_file' THEN 'Cytogenomic' + WHEN 'pathology_file' THEN 'Pathology imaging' + WHEN 'methylation_array_file' THEN 'Methylation array' + ELSE null END, + file_type: file.file_type, + library_selection: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_selection + ELSE null END, + library_source_material: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_material + ELSE null END, + library_source_molecule: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_molecule + ELSE null END, + library_strategy: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_strategy + ELSE null END + }) AS sample_diagnosis_filters_2 + with 
p, apoc.coll.union(sample_diagnosis_file_filters, sample_diagnosis_filters_2) as sample_diagnosis_file_filter + optional MATCH (p)<-[:of_sample]-(sm1:sample)<--(cl)<--(sm2:sample) + WHERE (cl: cell_line or cl: pdx) + optional Match (sm1)<--(dg:diagnosis) + optional match (p)<--(file) + where (file: clinical_measure_file or file: radiology_file) + with p, sample_diagnosis_file_filter, COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm1.anatomic_site, ';'), + participant_age_at_collection: sm1.participant_age_at_collection, + sample_tumor_status: sm1.sample_tumor_status, + tumor_classification: sm1.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE labels(file)[0] WHEN 'clinical_measure_file' THEN 'Clinical data' + WHEN 'radiology_file' THEN 'Radiology imaging' + ELSE null END, + file_type: file.file_type, + library_selection: null, + library_source_material: null, + library_source_molecule: null, + library_strategy: null + }) AS sample_diagnosis_filters_3 + with p, apoc.coll.union(sample_diagnosis_file_filter, sample_diagnosis_filters_3) as sample_diagnosis_file_filters + optional MATCH (p)<-[:of_sample]-(sm1:sample)<--(cl)<--(sm2:sample) + WHERE (cl: cell_line or cl: pdx) + optional Match (sm2)<--(dg:diagnosis) + optional match (p)<--(file) + where (file: clinical_measure_file or file: radiology_file) + with p, sample_diagnosis_file_filters, COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm2.anatomic_site, ';'), + participant_age_at_collection: sm2.participant_age_at_collection, + sample_tumor_status: sm2.sample_tumor_status, + tumor_classification: sm2.tumor_classification, + 
age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE labels(file)[0] WHEN 'clinical_measure_file' THEN 'Clinical data' + WHEN 'radiology_file' THEN 'Radiology imaging' + ELSE null END, + file_type: file.file_type, + library_selection: null, + library_source_material: null, + library_source_molecule: null, + library_strategy: null + }) AS sample_diagnosis_filters_4 + with p, apoc.coll.union(sample_diagnosis_file_filters, sample_diagnosis_filters_4) as sample_diagnosis_file_filter + OPTIONAL MATCH (p)<-[*..4]-(file) + WHERE (file:clinical_measure_file OR file: sequencing_file OR file:pathology_file OR file:radiology_file OR file:methylation_array_file OR file:cytogenomic_file) + OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) + OPTIONAL MATCH (p)<-[:of_synonym]-(sy:synonym) + OPTIONAL MATCH (st:study)<-[:of_participant]-(p) + OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) + OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) + WITH p, sy, sample_diagnosis_file_filter, COLLECT(DISTINCT CASE + WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' + ELSE su.last_known_survival_status + END) AS last_known_survival_status, + COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, + COLLECT(DISTINCT su.first_event) as first_event, + COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, file, st, stf, stp + WITH p, sy, sample_diagnosis_file_filter, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, + event_free_survival_status: event_free_survival_status, + first_event: first_event, + age_at_event_free_survival_status: 
age_at_event_free_survival_status} ) AS survival_filters, file, st, stf, stp + RETURN DISTINCT + p.id as id, + p.participant_id as participant_id, + apoc.text.split(p.race, ';') as race, + p.race as race_str, + p.sex_at_birth as sex_at_birth, + apoc.text.join(Collect(distinct sy.synonym_id), ',') as alternate_participant_id, + survival_filters as survival_filters, + sample_diagnosis_file_filter AS sample_diagnosis_file_filters, + st.study_id as study_id, + st.dbgap_accession as dbgap_accession, + st.study_acronym as study_acronym, + st.study_name as study_name, + COUNT(DISTINCT file.id) as file_count, + COLLECT(DISTINCT file.id) as files + + - index_name: diagnosis type: neo4j mapping: @@ -1520,8 +1895,17 @@ Indices: type: keyword library_strategy: type: keyword - last_known_survival_status: - type: keyword + survival_filters: + type: nested + properties: + last_known_survival_status: + type: keyword + age_at_event_free_survival_status: + type: integer + event_free_survival_status: + type: keyword + first_event: + type: keyword file_count: type: integer direct_file_count: @@ -1633,12 +2017,19 @@ Indices: direct_file_count: COUNT(DISTINCT direct_file.id) }) AS opensearch_data OPTIONAL MATCH (sm)-[*..3]->(:participant)<-[:of_survival]-(su:survival) - WITH sm, opensearch_data, COLLECT(DISTINCT su.last_known_survival_status) as vital_status + WITH sm, opensearch_data, COLLECT(DISTINCT CASE + WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' + ELSE su.last_known_survival_status + END) AS last_known_survival_status, + COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, + COLLECT(DISTINCT su.first_event) as first_event, + COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status + WITH sm, opensearch_data, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, + event_free_survival_status: event_free_survival_status, + first_event: first_event, + age_at_event_free_survival_status: 
age_at_event_free_survival_status} ) AS survival_filters WITH sm, apoc.map.merge(opensearch_data, { - last_known_survival_status: CASE - WHEN 'Dead' IN vital_status THEN ['Dead'] - ELSE vital_status - END + survival_filters: survival_filters }) AS opensearch_data return opensearch_data page_size: 500 @@ -1678,7 +2069,7 @@ Indices: tumor_stage_source: dg.tumor_stage_source, diagnosis: dg.diagnosis }) AS diagnosis_filters, - null as last_known_survival_status, + null AS survival_filters, CASE COLLECT(file) WHEN [] THEN [] ELSE COLLECT(DISTINCT { assay_method: CASE LABELS(file)[0] @@ -1817,8 +2208,17 @@ Indices: type: keyword diagnosis: type: keyword - last_known_survival_status: - type: keyword + survival_filters: + type: nested + properties: + last_known_survival_status: + type: keyword + age_at_event_free_survival_status: + type: integer + event_free_survival_status: + type: keyword + first_event: + type: keyword library_selection: type: keyword fields: From d7e77615f32bb27be72a40c6956a035aea07467d Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Wed, 9 Oct 2024 12:36:13 -0400 Subject: [PATCH 09/23] update file --- config/es_indices_ccdi_model.yml | 114 ++++++++++++++++++++++++------- 1 file changed, 89 insertions(+), 25 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index 5551d89..dce5eec 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -2172,8 +2172,18 @@ Indices: type: keyword diagnosis: type: keyword - last_known_survival_status: - type: keyword + survival_filters: + type: nested + properties: + last_known_survival_status: + type: keyword + age_at_event_free_survival_status: + type: integer + event_free_survival_status: + type: keyword + first_event: + type: keyword + participant_filters: type: nested properties: @@ -2338,6 +2348,18 @@ Indices: OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) OPTIONAL MATCH 
(st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) + with file, sample_diagnosis_filter,COLLECT(DISTINCT CASE + WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' + ELSE su.last_known_survival_status + END) AS last_known_survival_status, + COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, + COLLECT(DISTINCT su.first_event) as first_event, + COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status + with file, sample_diagnosis_filter,COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, + event_free_survival_status: event_free_survival_status, + first_event: first_event, + age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters + RETURN DISTINCT file.id as id, p.id as pid, @@ -2367,8 +2389,7 @@ Indices: sex_at_birth: p.sex_at_birth }) AS participant_filters, sample_diagnosis_filter AS sample_diagnosis_filters, - case when 'Dead' in COLLECT(DISTINCT su.last_known_survival_status) then ['Dead'] - else COLLECT(DISTINCT su.last_known_survival_status) end as last_known_survival_status, + survival_filters as survival_filters, null AS library_selection, null AS library_source_material, null AS library_source_molecule, @@ -2461,15 +2482,23 @@ Indices: tumor_grade_source: dg.tumor_grade_source, tumor_stage_source: dg.tumor_stage_source, diagnosis: dg.diagnosis - })) AS sample_diagnosis_filter_6, COLLECT(DISTINCT su.last_known_survival_status) as vital_status - with file, p, apoc.coll.union(sample_diagnosis_filter_5, sample_diagnosis_filter_6) as sample_diagnosis_filter, vital_status + })) AS sample_diagnosis_filter_6, COLLECT(DISTINCT CASE + WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' + ELSE su.last_known_survival_status + END) AS last_known_survival_status, + COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, + COLLECT(DISTINCT su.first_event) as 
first_event, + COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status + with file, p, apoc.coll.union(sample_diagnosis_filter_5, sample_diagnosis_filter_6) as sample_diagnosis_filter, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, + event_free_survival_status: event_free_survival_status, + first_event: first_event, + age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters with file, collect(DISTINCT { participant_id: p.participant_id, race: apoc.text.split(p.race, ';'), sex_at_birth: p.sex_at_birth, sample_diagnosis_filters: sample_diagnosis_filter, - last_known_survival_status: case when 'Dead' in vital_status then ['Dead'] - else vital_status end + survival_filters: survival_filters }) as combined_filter_1 MATCH (st:study)<-[:of_clinical_measure_file]-(file) OPTIONAL MATCH (st)<--(cl)<--(sm:sample) @@ -2494,7 +2523,7 @@ Indices: race: null, sex_at_birth: null, sample_diagnosis_filters: sample_diagnosis_filter, - last_known_survival_status: null + survival_filters: survival_filters }) as combined_filter_2 with file, apoc.coll.union(combined_filter_1, combined_filter_2) as combined_filter MATCH (st:study)<-[:of_clinical_measure_file]-(file) @@ -2523,7 +2552,7 @@ Indices: combined_filter as combined_filters, null as participant_filters, null as sample_diagnosis_filters, - null as last_known_survival_status, + null as survival_filters, null AS library_selection, null AS library_source_material, null AS library_source_molecule, @@ -2632,7 +2661,17 @@ Indices: OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - with file, p, sample_diagnosis_filter, sm1, sm, st, COLLECT(DISTINCT su.last_known_survival_status) as vital_status, stf, stp + with file, p, sample_diagnosis_filter, sm1, sm, st, COLLECT(DISTINCT CASE + WHEN su.last_known_survival_status = 
'Dead' THEN 'Dead' + ELSE su.last_known_survival_status + END) AS last_known_survival_status, + COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, + COLLECT(DISTINCT su.first_event) as first_event, + COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, stf, stp + with file, p, sample_diagnosis_filter, sm1, sm, st, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, + event_free_survival_status: event_free_survival_status, + first_event: first_event, + age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, stf, stp RETURN DISTINCT file.id as id, p.id as pid, @@ -2666,8 +2705,7 @@ Indices: sex_at_birth: p.sex_at_birth }) AS participant_filters, sample_diagnosis_filter AS sample_diagnosis_filters, - case when 'Dead' in vital_status then ['Dead'] - else vital_status end as last_known_survival_status, + survival_filters as survival_filters, CASE LABELS(file)[0] WHEN 'sequencing_file' THEN file.library_selection ELSE null END AS library_selection, CASE LABELS(file)[0] WHEN 'sequencing_file' THEN file.library_source_material @@ -2730,7 +2768,7 @@ Indices: tumor_stage_source: dg.tumor_stage_source, diagnosis: dg.diagnosis }) AS sample_diagnosis_filters, - null as last_known_survival_status, + null as survival_filters, CASE LABELS(file)[0] WHEN 'sequencing_file' THEN file.library_selection ELSE null END AS library_selection, @@ -2838,7 +2876,17 @@ Indices: OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - with distinct p, sm, st, sample_diagnosis_filter, COLLECT(DISTINCT su.last_known_survival_status) as vital_status, stf, stp + with distinct p, sm, st, sample_diagnosis_filter, COLLECT(DISTINCT CASE + WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' + ELSE su.last_known_survival_status + END) AS 
last_known_survival_status, + COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, + COLLECT(DISTINCT su.first_event) as first_event, + COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, stf, stp + with distinct p, sm, st, sample_diagnosis_filter, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, + event_free_survival_status: event_free_survival_status, + first_event: first_event, + age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, stf, stp RETURN DISTINCT null as id, p.id as pid, @@ -2862,9 +2910,7 @@ Indices: race: apoc.text.split(p.race, ';'), sex_at_birth: p.sex_at_birth }) AS participant_filters, - case when 'Dead' in vital_status then ['Dead'] - else vital_status end as last_known_survival_status, - sample_diagnosis_filter AS sample_diagnosis_filters, + survival_filters as survival_filters, null AS library_selection, null AS library_source_material, null AS library_source_molecule, @@ -2876,7 +2922,17 @@ Indices: OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - with sm, p, st, dg, COLLECT(DISTINCT su.last_known_survival_status) as vital_status, stf, stp + with sm, p, st, dg, COLLECT(DISTINCT CASE + WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' + ELSE su.last_known_survival_status + END) AS last_known_survival_status, + COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, + COLLECT(DISTINCT su.first_event) as first_event, + COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, stf, stp + with sm, p, st, dg, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, + event_free_survival_status: event_free_survival_status, + first_event: first_event, + age_at_event_free_survival_status: 
age_at_event_free_survival_status} ) AS survival_filters, stf, stp RETURN DISTINCT null as id, p.id as pid, @@ -2914,8 +2970,7 @@ Indices: tumor_stage_source: dg.tumor_stage_source, diagnosis: dg.diagnosis }) AS sample_diagnosis_filters, - case when 'Dead' in vital_status then ['Dead'] - else vital_status end as last_known_survival_status, + survival_filters as survival_filters, null AS library_selection, null AS library_source_material, null AS library_source_molecule, @@ -2947,7 +3002,7 @@ Indices: null as files, null as combined_filters, null AS participant_filters, - null as last_known_survival_status, + null as survival_filters, COLLECT(DISTINCT { sample_anatomic_site: apoc.text.split(sm.anatomic_site, ';'), participant_age_at_collection: sm.participant_age_at_collection, @@ -2973,7 +3028,17 @@ Indices: OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - with p, st, dg, COLLECT(DISTINCT su.last_known_survival_status) as vital_status, stf, stp + with p, st, dg, COLLECT(DISTINCT CASE + WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' + ELSE su.last_known_survival_status + END) AS last_known_survival_status, + COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, + COLLECT(DISTINCT su.first_event) as first_event, + COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, stf, stp + with p, st, dg, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, + event_free_survival_status: event_free_survival_status, + first_event: first_event, + age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, stf, stp RETURN DISTINCT null as id, p.id as pid, @@ -3011,8 +3076,7 @@ Indices: tumor_stage_source: dg.tumor_stage_source, diagnosis_classification: dg.diagnosis_classification }) AS sample_diagnosis_filters, - case when 'Dead' 
in vital_status then ['Dead'] - else vital_status end as last_known_survival_status, + survival_filters as survival_filters, null AS library_selection, null AS library_source_material, null AS library_source_molecule, From c22da39ee7c99fd21da1b2b43bf9a85396c75659 Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Wed, 16 Oct 2024 10:56:45 -0400 Subject: [PATCH 10/23] add treatments indices --- config/es_indices_ccdi_model.yml | 604 ++++++++++++++++++++++++++++--- 1 file changed, 556 insertions(+), 48 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index dce5eec..2db6f7a 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -24,6 +24,22 @@ Indices: type: keyword first_event: type: keyword + treatment_filters: + type: nested + properties: + treatment_type: + type: keyword + treatment_agent: + type: keyword + age_at_treatment_start: + type: integer + treatment_response_filters: + type: nested + properties: + response_category: + type: keyword + age_at_response: + type: integer sample_diagnosis_file_filters: type: nested properties: @@ -332,21 +348,21 @@ Indices: }) AS sample_diagnosis_filters_4 with p, apoc.coll.union(sample_diagnosis_file_filters, sample_diagnosis_filters_4) as sample_diagnosis_file_filter OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) + OPTIONAL MATCH (p)<-[:of_treatment]-(tm:treatment) + OPTIONAL MATCH (p)<-[:of_treatment_response]-(tr:treatment_response) OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) WITH p, sample_diagnosis_file_filter, st, stf, stp, - COLLECT(DISTINCT CASE - WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' - ELSE su.last_known_survival_status - END) AS last_known_survival_status, - COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, - 
COLLECT(DISTINCT su.first_event) as first_event, - COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status - WITH p, sample_diagnosis_file_filter, st, stf, stp, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, - event_free_survival_status: event_free_survival_status, - first_event: first_event, - age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters + COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, + event_free_survival_status: su.event_free_survival_status, + first_event: su.first_event, + age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + COLLECT(DISTINCT{treatment_type: tm.treatment_type, + treatment_agent: tm.treatment_agent, + age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, + COLLECT(DISTINCT{response_category: tr.response_category, + age_at_response: tr.age_at_response}) as treatment_response_filters RETURN DISTINCT p.id as id, p.id as pid, @@ -354,6 +370,8 @@ Indices: apoc.text.split(p.race, ';') as race, p.sex_at_birth as sex_at_birth, survival_filters as survival_filters, + treatment_filters as treatment_filters, + treatment_response_filters as treatment_response_filters, sample_diagnosis_file_filter AS sample_diagnosis_file_filters, st.study_id as study_id, st.dbgap_accession as dbgap_accession, @@ -397,6 +415,11 @@ Indices: event_free_survival_status: null, first_event: null, }) AS survival_filters, + COLLECT(DISTINCT{treatment_type: null, + treatment_agent: null, + age_at_treatment_start: null}) as treatment_filters, + COLLECT(DISTINCT{response_category: null, + age_at_response: null}) as treatment_response_filters, st.study_id as study_id, st.dbgap_accession as dbgap_accession, st.study_acronym as study_acronym, @@ -423,6 +446,11 @@ Indices: null as event_free_survival_status, null as first_event, }) AS survival_filters, + COLLECT(DISTINCT{null as 
treatment_type, + tnull as treatment_agent, + null as age_at_treatment_start}) as treatment_filters, + COLLECT(DISTINCT{null as response_category, + null as age_at_response}) as treatment_response_filters COLLECT(DISTINCT { sample_anatomic_site: apoc.text.split(sm.anatomic_site, ';'), participant_age_at_collection: sm.participant_age_at_collection, @@ -491,6 +519,22 @@ Indices: type: keyword first_event: type: keyword + treatment_filters: + type: nested + properties: + treatment_type: + type: keyword + treatment_agent: + type: keyword + age_at_treatment_start: + type: integer + treatment_response_filters: + type: nested + properties: + response_category: + type: keyword + age_at_response: + type: integer sample_diagnosis_file_filters: type: nested properties: @@ -807,21 +851,22 @@ Indices: OPTIONAL MATCH (p)<-[*..4]-(file) WHERE (file:clinical_measure_file OR file: sequencing_file OR file:pathology_file OR file:radiology_file OR file:methylation_array_file OR file:cytogenomic_file) OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) + OPTIONAL MATCH (p)<-[:of_treatment]-(tm:treatment) + OPTIONAL MATCH (p)<-[:of_treatment_response]-(tr:treatment_response) OPTIONAL MATCH (p)<-[:of_synonym]-(sy:synonym) OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - WITH p, sy, sample_diagnosis_file_filter, COLLECT(DISTINCT CASE - WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' - ELSE su.last_known_survival_status - END) AS last_known_survival_status, - COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, - COLLECT(DISTINCT su.first_event) as first_event, - COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, file, st, stf, stp - WITH p, sy, sample_diagnosis_file_filter, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, - event_free_survival_status: 
event_free_survival_status, - first_event: first_event, - age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, file, st, stf, stp + WITH p, sy, sample_diagnosis_file_filter, + COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, + event_free_survival_status: su.event_free_survival_status, + first_event: su.first_event, + age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + COLLECT(DISTINCT{treatment_type: tm.treatment_type, + treatment_agent: tm.treatment_agent, + age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, + COLLECT(DISTINCT{response_category: tr.response_category, + age_at_response: tr.age_at_response}) as treatment_response_filters,file, st, stf, stp RETURN DISTINCT p.id as id, p.participant_id as participant_id, @@ -830,6 +875,8 @@ Indices: p.sex_at_birth as sex_at_birth, apoc.text.join(Collect(distinct sy.synonym_id), ',') as alternate_participant_id, survival_filters as survival_filters, + treatment_filters as treatment_filters, + treatment_response_filters as treatment_response_filters, sample_diagnosis_file_filter AS sample_diagnosis_file_filters, st.study_id as study_id, st.dbgap_accession as dbgap_accession, @@ -848,11 +895,392 @@ Indices: normalizer: lowercase race: type: keyword - race_str: - type: keyword sex_at_birth: type: keyword - alternate_participant_id: + last_known_survival_status: + type: keyword + age_at_event_free_survival_status: + type: integer + event_free_survival_status: + type: keyword + first_event: + type: keyword + treatment_filters: + type: nested + properties: + treatment_type: + type: keyword + treatment_agent: + type: keyword + age_at_treatment_start: + type: integer + treatment_response_filters: + type: nested + properties: + response_category: + type: keyword + age_at_response: + type: integer + sample_diagnosis_file_filters: + type: nested + properties: + sample_anatomic_site: + type: 
keyword + participant_age_at_collection: + type: integer + sample_tumor_status: + type: keyword + tumor_classification: + type: keyword + age_at_diagnosis: + type: integer + diagnosis_anatomic_site: + type: keyword + disease_phase: + type: keyword + diagnosis_classification_system: + type: keyword + diagnosis_basis: + type: keyword + tumor_grade_source: + type: keyword + tumor_stage_source: + type: keyword + diagnosis: + type: keyword + assay_method: + type: keyword + file_type: + type: keyword + library_selection: + type: keyword + library_source_material: + type: keyword + library_source_molecule: + type: keyword + library_strategy: + type: keyword + study_id: + type: keyword + dbgap_accession: + type: keyword + study_acronym: + type: keyword + study_name: + type: keyword + file_count: + type: integer + files: + type: text + fields: + keyword: + type: keyword + # Cypher query will be used to retrieve data from Neo4j, and index into Elasticsearch + cypher_queries: + - query: | + MATCH (p:participant) + optional match (p)<--(sm:sample) + optional match (p)<--(file) + where (file: clinical_measure_file or file: radiology_file) + with distinct p, sm, file + with p, collect(DISTINCT { + sample_anatomic_site: apoc.text.split(sm.anatomic_site, ';'), + participant_age_at_collection: sm.participant_age_at_collection, + sample_tumor_status: sm.sample_tumor_status, + tumor_classification: sm.tumor_classification, + assay_method: CASE labels(file)[0] WHEN 'clinical_measure_file' THEN 'Clinical data' + WHEN 'radiology_file' THEN 'Radiology imaging' + ELSE null END, + file_type: file.file_type, + library_source_material: null, + library_source_molecule: null, + library_strategy: null + }) as sample_clinical_radiology_file_filter + optional match (p)<--(sm:sample)<--(file) + where (file: sequencing_file or file: methylation_array_file or file: pathology_file or file: cytogenomic_file) + with p, sample_clinical_radiology_file_filter, collect(DISTINCT { + sample_anatomic_site: 
apoc.text.split(sm.anatomic_site, ';'), + participant_age_at_collection: sm.participant_age_at_collection, + sample_tumor_status: sm.sample_tumor_status, + tumor_classification: sm.tumor_classification, + assay_method: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN 'Sequencing' + WHEN 'cytogenomic_file' THEN 'Cytogenomic' + WHEN 'pathology_file' THEN 'Pathology imaging' + WHEN 'methylation_array_file' THEN 'Methylation array' + ELSE null END, + file_type: file.file_type, + library_selection: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_selection + ELSE null END, + library_source_material: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_material + ELSE null END, + library_source_molecule: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_molecule + ELSE null END, + library_strategy: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_strategy + ELSE null END + }) as sample_sequencing_cytogenomic_pathology_methylation_file_filter + with p, apoc.coll.union(sample_clinical_radiology_file_filter, sample_sequencing_cytogenomic_pathology_methylation_file_filter) as sample_file_filters + optional match (p)<--(dg:diagnosis) + with p, sample_file_filters, dg + unwind sample_file_filters as sample_file_filter + with p, collect(apoc.map.merge(sample_file_filter, { + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis + })) as sample_diagnosis_file_filter + optional match (p)<--(sm:sample)<--(dg:diagnosis) + optional match (sm)<--(file) + where (file: sequencing_file or file: methylation_array_file or file: pathology_file or file: cytogenomic_file) + with p, sample_diagnosis_file_filter, 
COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm.anatomic_site, ';'), + participant_age_at_collection: sm.participant_age_at_collection, + sample_tumor_status: sm.sample_tumor_status, + tumor_classification: sm.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN 'Sequencing' + WHEN 'cytogenomic_file' THEN 'Cytogenomic' + WHEN 'pathology_file' THEN 'Pathology imaging' + WHEN 'methylation_array_file' THEN 'Methylation array' + ELSE null END, + file_type: file.file_type, + library_selection: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_selection + ELSE null END, + library_source_material: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_material + ELSE null END, + library_source_molecule: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_molecule + ELSE null END, + library_strategy: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_strategy + ELSE null END + }) AS sample_diagnosis_filters_1 + with p, apoc.coll.union(sample_diagnosis_file_filter, sample_diagnosis_filters_1) as sample_diagnosis_file_filters + optional match (p)<--(sm:sample)<--(dg:diagnosis) + optional match (p)<--(file) + where (file: clinical_measure_file or file: radiology_file) + with p, sample_diagnosis_file_filters, COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm.anatomic_site, ';'), + participant_age_at_collection: sm.participant_age_at_collection, + sample_tumor_status: sm.sample_tumor_status, + tumor_classification: sm.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + 
diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE labels(file)[0] WHEN 'clinical_measure_file' THEN 'Clinical data' + WHEN 'radiology_file' THEN 'Radiology imaging' + ELSE null END, + file_type: file.file_type, + library_selection: null, + library_source_material: null, + library_source_molecule: null, + library_strategy: null + }) AS sample_diagnosis_filters_2 + with p, apoc.coll.union(sample_diagnosis_file_filters, sample_diagnosis_filters_2) as sample_diagnosis_file_filter + optional MATCH (p)<-[:of_sample]-(sm1:sample)<--(cl)<--(sm2:sample) + WHERE (cl: cell_line or cl: pdx) + optional Match (sm2)<--(file) + WHERE (file: sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:cytogenomic_file) + optional Match (sm1)<--(dg:diagnosis) + with p, sample_diagnosis_file_filter, COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm1.anatomic_site, ';'), + participant_age_at_collection: sm1.participant_age_at_collection, + sample_tumor_status: sm1.sample_tumor_status, + tumor_classification: sm1.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN 'Sequencing' + WHEN 'cytogenomic_file' THEN 'Cytogenomic' + WHEN 'pathology_file' THEN 'Pathology imaging' + WHEN 'methylation_array_file' THEN 'Methylation array' + ELSE null END, + file_type: 
file.file_type, + library_selection: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_selection + ELSE null END, + library_source_material: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_material + ELSE null END, + library_source_molecule: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_molecule + ELSE null END, + library_strategy: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_strategy + ELSE null END + }) AS sample_diagnosis_filters_1 + with p, apoc.coll.union(sample_diagnosis_file_filter, sample_diagnosis_filters_1) as sample_diagnosis_file_filters + optional MATCH (p)<-[:of_sample]-(sm1:sample)<--(cl)<--(sm2:sample) + WHERE (cl: cell_line or cl: pdx) + optional Match (sm2)<--(file) + WHERE (file: sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:cytogenomic_file) + optional Match (sm2)<--(dg:diagnosis) + with p, sample_diagnosis_file_filters, COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm2.anatomic_site, ';'), + participant_age_at_collection: sm2.participant_age_at_collection, + sample_tumor_status: sm2.sample_tumor_status, + tumor_classification: sm2.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN 'Sequencing' + WHEN 'cytogenomic_file' THEN 'Cytogenomic' + WHEN 'pathology_file' THEN 'Pathology imaging' + WHEN 'methylation_array_file' THEN 'Methylation array' + ELSE null END, + file_type: file.file_type, + library_selection: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_selection + ELSE null END, + 
library_source_material: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_material + ELSE null END, + library_source_molecule: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_molecule + ELSE null END, + library_strategy: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_strategy + ELSE null END + }) AS sample_diagnosis_filters_2 + with p, apoc.coll.union(sample_diagnosis_file_filters, sample_diagnosis_filters_2) as sample_diagnosis_file_filter + optional MATCH (p)<-[:of_sample]-(sm1:sample)<--(cl)<--(sm2:sample) + WHERE (cl: cell_line or cl: pdx) + optional Match (sm1)<--(dg:diagnosis) + optional match (p)<--(file) + where (file: clinical_measure_file or file: radiology_file) + with p, sample_diagnosis_file_filter, COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm1.anatomic_site, ';'), + participant_age_at_collection: sm1.participant_age_at_collection, + sample_tumor_status: sm1.sample_tumor_status, + tumor_classification: sm1.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE labels(file)[0] WHEN 'clinical_measure_file' THEN 'Clinical data' + WHEN 'radiology_file' THEN 'Radiology imaging' + ELSE null END, + file_type: file.file_type, + library_selection: null, + library_source_material: null, + library_source_molecule: null, + library_strategy: null + }) AS sample_diagnosis_filters_3 + with p, apoc.coll.union(sample_diagnosis_file_filter, sample_diagnosis_filters_3) as sample_diagnosis_file_filters + optional MATCH (p)<-[:of_sample]-(sm1:sample)<--(cl)<--(sm2:sample) + WHERE (cl: cell_line or cl: pdx) + optional Match 
(sm2)<--(dg:diagnosis) + optional match (p)<--(file) + where (file: clinical_measure_file or file: radiology_file) + with p, sample_diagnosis_file_filters, COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm2.anatomic_site, ';'), + participant_age_at_collection: sm2.participant_age_at_collection, + sample_tumor_status: sm2.sample_tumor_status, + tumor_classification: sm2.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE labels(file)[0] WHEN 'clinical_measure_file' THEN 'Clinical data' + WHEN 'radiology_file' THEN 'Radiology imaging' + ELSE null END, + file_type: file.file_type, + library_selection: null, + library_source_material: null, + library_source_molecule: null, + library_strategy: null + }) AS sample_diagnosis_filters_4 + with p, apoc.coll.union(sample_diagnosis_file_filters, sample_diagnosis_filters_4) as sample_diagnosis_file_filter + OPTIONAL MATCH (p)<-[*..4]-(file) + WHERE (file:clinical_measure_file OR file: sequencing_file OR file:pathology_file OR file:radiology_file OR file:methylation_array_file OR file:cytogenomic_file) + OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) + OPTIONAL MATCH (p)<-[:of_treatment]-(tm:treatment) + OPTIONAL MATCH (p)<-[:of_treatment_response]-(tr:treatment_response) + OPTIONAL MATCH (p)<-[:of_synonym]-(sy:synonym) + OPTIONAL MATCH (st:study)<-[:of_participant]-(p) + OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) + OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) + WITH su, p, sy, sample_diagnosis_file_filter, + COLLECT(DISTINCT{treatment_type: tm.treatment_type, + treatment_agent: tm.treatment_agent, + age_at_treatment_start: 
tm.age_at_treatment_start}) as treatment_filters, + COLLECT(DISTINCT{response_category: tr.response_category, + age_at_response: tr.age_at_response}) as treatment_response_filters, file, st, stf, stp + RETURN DISTINCT + p.id as id, + p.participant_id as participant_id, + apoc.text.split(p.race, ';') as race, + p.race as race_str, + p.sex_at_birth as sex_at_birth, + apoc.text.join(Collect(distinct sy.synonym_id), ',') as alternate_participant_id, + su.last_known_survival_status as last_known_survival_status, + su.age_at_event_free_survival_status as age_at_event_free_survival_status, + su.event_free_survival_status as event_free_survival_status, + su.first_event as first_event, + treatment_filters as treatment_filters, + treatment_response_filters as treatment_response_filters, + sample_diagnosis_file_filter AS sample_diagnosis_file_filters, + st.study_id as study_id, + st.dbgap_accession as dbgap_accession, + st.study_acronym as study_acronym, + st.study_name as study_name, + COUNT(DISTINCT file.id) as file_count, + COLLECT(DISTINCT file.id) as files + + - index_name: treatments + type: neo4j + mapping: + id: + type: keyword + participant_id: + type: keyword + normalizer: lowercase + race: + type: keyword + sex_at_birth: type: keyword survival_filters: type: nested @@ -865,6 +1293,19 @@ Indices: type: keyword first_event: type: keyword + treatment_type: + type: keyword + treatment_agent: + type: keyword + age_at_treatment_start: + type: integer + treatment_response_filters: + type: nested + properties: + response_category: + type: keyword + age_at_response: + type: integer sample_diagnosis_file_filters: type: nested properties: @@ -1181,21 +1622,18 @@ Indices: OPTIONAL MATCH (p)<-[*..4]-(file) WHERE (file:clinical_measure_file OR file: sequencing_file OR file:pathology_file OR file:radiology_file OR file:methylation_array_file OR file:cytogenomic_file) OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) + OPTIONAL MATCH (p)<-[:of_treatment]-(tm:treatment) + OPTIONAL 
MATCH (p)<-[:of_treatment_response]-(tr:treatment_response) OPTIONAL MATCH (p)<-[:of_synonym]-(sy:synonym) OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - WITH p, sy, sample_diagnosis_file_filter, COLLECT(DISTINCT CASE - WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' - ELSE su.last_known_survival_status - END) AS last_known_survival_status, - COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, - COLLECT(DISTINCT su.first_event) as first_event, - COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, file, st, stf, stp - WITH p, sy, sample_diagnosis_file_filter, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, - event_free_survival_status: event_free_survival_status, - first_event: first_event, - age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, file, st, stf, stp + WITH su, p, sy, sample_diagnosis_file_filter, + COLLECT(DISTINCT{treatment_type: tm.treatment_type, + treatment_agent: tm.treatment_agent, + age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, + COLLECT(DISTINCT{response_category: tr.response_category, + age_at_response: tr.age_at_response}) as treatment_response_filters, file, st, stf, stp RETURN DISTINCT p.id as id, p.participant_id as participant_id, @@ -1203,7 +1641,12 @@ Indices: p.race as race_str, p.sex_at_birth as sex_at_birth, apoc.text.join(Collect(distinct sy.synonym_id), ',') as alternate_participant_id, - survival_filters as survival_filters, + su.last_known_survival_status as last_known_survival_status, + su.age_at_event_free_survival_status as age_at_event_free_survival_status, + su.event_free_survival_status as event_free_survival_status, + su.first_event as first_event, + treatment_filters as treatment_filters, + treatment_response_filters as 
treatment_response_filters, sample_diagnosis_file_filter AS sample_diagnosis_file_filters, st.study_id as study_id, st.dbgap_accession as dbgap_accession, @@ -1213,6 +1656,7 @@ Indices: COLLECT(DISTINCT file.id) as files + - index_name: diagnosis type: neo4j mapping: @@ -1269,6 +1713,22 @@ Indices: type: keyword first_event: type: keyword + treatment_filters: + type: nested + properties: + treatment_type: + type: keyword + treatment_agent: + type: keyword + age_at_treatment_start: + type: integer + treatment_response_filters: + type: nested + properties: + response_category: + type: keyword + age_at_response: + type: integer sample_file_filters: type: nested properties: @@ -1431,7 +1891,9 @@ Indices: OPTIONAL MATCH (p)<-[*..4]-(file) WHERE (file:clinical_measure_file OR file: sequencing_file OR file:pathology_file OR file:radiology_file OR file:methylation_array_file OR file:cytogenomic_file) OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) - with p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters,participant_radiology_file_filters, dg, file, su + OPTIONAL MATCH (p)<-[:of_treatment]-(tm:treatment) + OPTIONAL MATCH (p)<-[:of_treatment_response]-(tr:treatment_response) + with p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters,participant_radiology_file_filters, dg, file, su, 
tm, tr OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) @@ -1441,11 +1903,16 @@ Indices: END) AS last_known_survival_status, COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, COLLECT(DISTINCT su.first_event) as first_event, - COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, st, stf, stp, dg + COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, + COLLECT(DISTINCT{treatment_type: tm.treatment_type, + treatment_agent: tm.treatment_agent, + age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, + COLLECT(DISTINCT{response_category: tr.response_category, + age_at_response: tr.age_at_response}) as treatment_response_filters, st, stf, stp, dg WITH p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters,participant_radiology_file_filters, file, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, event_free_survival_status: event_free_survival_status, first_event: first_event, - age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, st, stf, stp, dg + age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, st, stf, stp, dg, treatment_filters,treatment_response_filters RETURN DISTINCT dg.id as id, p.id as pid, @@ -1468,7 +1935,9 @@ Indices: st.dbgap_accession as dbgap_accession, st.study_acronym as study_acronym, st.study_name as study_name, - survival_filters as survival_filters, + survival_filters as survival_filters, + treatment_filters as treatment_filters, + treatment_response_filters as treatment_response_filters, apoc.coll.union(cell_line_pdx_file_filters, general_file_filters) + participant_clinical_measure_file_filters + participant_radiology_file_filters AS sample_file_filters, 
COUNT(DISTINCT file.id) as file_count, COLLECT(DISTINCT file.id) as files @@ -1537,6 +2006,8 @@ Indices: optional match (sm)<--(file) where (file: sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:cytogenomic_file) OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) + OPTIONAL MATCH (p)<-[:of_treatment]-(tm:treatment) + OPTIONAL MATCH (p)<-[:of_treatment_response]-(tr:treatment_response) OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) @@ -1546,11 +2017,16 @@ Indices: END) AS last_known_survival_status, COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, COLLECT(DISTINCT su.first_event) as first_event, - COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, st, stf, stp + COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, + COLLECT(DISTINCT{treatment_type: tm.treatment_type, + treatment_agent: tm.treatment_agent, + age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, + COLLECT(DISTINCT{response_category: tr.response_category, + age_at_response: tr.age_at_response}) as treatment_response_filters, st, stf, stp WITH dg, p, sm, sample_file_filter, file, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, event_free_survival_status: event_free_survival_status, first_event: first_event, - age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, st, stf, stp + age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, st, stf, stp,treatment_filters,treatment_response_filters RETURN DISTINCT dg.id as id, p.id as pid, @@ -1573,7 +2049,9 @@ Indices: st.dbgap_accession as dbgap_accession, st.study_acronym as study_acronym, st.study_name as study_name, - survival_filters as survival_filters, + survival_filters 
as survival_filters, + treatment_filters as treatment_filters, + treatment_response_filters as treatment_response_filters, sample_file_filter AS sample_file_filters, COUNT(DISTINCT file.id) as file_count, COLLECT(DISTINCT file.id) as files @@ -1622,6 +2100,8 @@ Indices: with dg, sample_file_filter, collect(distinct file.id) as files, apoc.coll.union(collect(distinct sm1.id), collect(distinct sm.id)) as sid, apoc.coll.union(collect(distinct sm1.sample_id), collect(distinct sm.sample_id)) as sample_id optional match (p:participant)<-[*..4]-(dg) OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) + OPTIONAL MATCH (p)<-[:of_treatment]-(tm:treatment) + OPTIONAL MATCH (p)<-[:of_treatment_response]-(tr:treatment_response) OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) @@ -1631,11 +2111,16 @@ Indices: END) AS last_known_survival_status, COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, COLLECT(DISTINCT su.first_event) as first_event, - COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, st, stf, stp + COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, + COLLECT(DISTINCT{treatment_type: tm.treatment_type, + treatment_agent: tm.treatment_agent, + age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, + COLLECT(DISTINCT{response_category: tr.response_category, + age_at_response: tr.age_at_response}) as treatment_response_filters, st, stf, stp WITH dg, p, sid, sample_id, sample_file_filter, files, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, event_free_survival_status: event_free_survival_status, first_event: first_event, - age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, st, stf, stp + age_at_event_free_survival_status: 
age_at_event_free_survival_status} ) AS survival_filters, st, stf, stp,treatment_filters,treatment_response_filters RETURN DISTINCT dg.id as id, p.id as pid, @@ -1658,7 +2143,9 @@ Indices: st.dbgap_accession as dbgap_accession, st.study_acronym as study_acronym, st.study_name as study_name, - survival_filters as survival_filters, + survival_filters as survival_filters, + treatment_filters as treatment_filters, + treatment_response_filters as treatment_response_filters, sample_file_filter AS sample_file_filters, size(files) as file_count, files as files @@ -1726,7 +2213,12 @@ Indices: age_at_event_free_survival_status: null, event_free_survival_status: null, first_event: null, - }) AS survival_filters, + }) AS survival_filters, + COLLECT(DISTINCT{treatment_type: null, + treatment_agent: null, + age_at_treatment_start: null}) as treatment_filters, + COLLECT(DISTINCT{response_category: null, + age_at_response: null}) as treatment_response_filters, sample_file_filter AS sample_file_filters, COUNT(DISTINCT file.id) as file_count, COLLECT(DISTINCT file.id) as files @@ -1906,6 +2398,22 @@ Indices: type: keyword first_event: type: keyword + treatment_filters: + type: nested + properties: + treatment_type: + type: keyword + treatment_agent: + type: keyword + age_at_treatment_start: + type: integer + treatment_response_filters: + type: nested + properties: + response_category: + type: keyword + age_at_response: + type: integer file_count: type: integer direct_file_count: From bbbd894f3da4f73ce16ee9e9299ebb929ee1a08e Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Wed, 16 Oct 2024 11:28:53 -0400 Subject: [PATCH 11/23] add treatments and treatment_response indices --- config/es_indices_ccdi_model.yml | 647 +++++++++++++++++++++++++------ 1 file changed, 523 insertions(+), 124 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index 2db6f7a..25fbf23 100644 --- 
a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -1628,10 +1628,10 @@ Indices: OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - WITH su, p, sy, sample_diagnosis_file_filter, - COLLECT(DISTINCT{treatment_type: tm.treatment_type, - treatment_agent: tm.treatment_agent, - age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, + WITH tm, p, sy, sample_diagnosis_file_filter, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, + event_free_survival_status: su.event_free_survival_status, + first_event: su.first_event, + age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, COLLECT(DISTINCT{response_category: tr.response_category, age_at_response: tr.age_at_response}) as treatment_response_filters, file, st, stf, stp RETURN DISTINCT @@ -1641,10 +1641,10 @@ Indices: p.race as race_str, p.sex_at_birth as sex_at_birth, apoc.text.join(Collect(distinct sy.synonym_id), ',') as alternate_participant_id, - su.last_known_survival_status as last_known_survival_status, - su.age_at_event_free_survival_status as age_at_event_free_survival_status, - su.event_free_survival_status as event_free_survival_status, - su.first_event as first_event, + survival_filters as survival_filters, + tm.treatment_type as treatment_type, + tm.treatment_agent as treatment_agent, + tm.age_at_treatment_start as age_at_treatment_start, treatment_filters as treatment_filters, treatment_response_filters as treatment_response_filters, sample_diagnosis_file_filter AS sample_diagnosis_file_filters, @@ -1655,7 +1655,390 @@ Indices: COUNT(DISTINCT file.id) as file_count, COLLECT(DISTINCT file.id) as files - + - index_name: treatment_responses + type: neo4j + mapping: + id: + type: keyword + participant_id: + type: keyword + normalizer: lowercase + race: + type: keyword + 
sex_at_birth: + type: keyword + survival_filters: + type: nested + properties: + last_known_survival_status: + type: keyword + age_at_event_free_survival_status: + type: integer + event_free_survival_status: + type: keyword + first_event: + type: keyword + treatment_filters: + type: nested + properties: + treatment_type: + type: keyword + treatment_agent: + type: keyword + age_at_treatment_start: + type: integer + response_category: + type: keyword + age_at_response: + type: integer + sample_diagnosis_file_filters: + type: nested + properties: + sample_anatomic_site: + type: keyword + participant_age_at_collection: + type: integer + sample_tumor_status: + type: keyword + tumor_classification: + type: keyword + age_at_diagnosis: + type: integer + diagnosis_anatomic_site: + type: keyword + disease_phase: + type: keyword + diagnosis_classification_system: + type: keyword + diagnosis_basis: + type: keyword + tumor_grade_source: + type: keyword + tumor_stage_source: + type: keyword + diagnosis: + type: keyword + assay_method: + type: keyword + file_type: + type: keyword + library_selection: + type: keyword + library_source_material: + type: keyword + library_source_molecule: + type: keyword + library_strategy: + type: keyword + study_id: + type: keyword + dbgap_accession: + type: keyword + study_acronym: + type: keyword + study_name: + type: keyword + file_count: + type: integer + files: + type: text + fields: + keyword: + type: keyword + # Cypher query will be used to retrieve data from Neo4j, and index into Elasticsearch + cypher_queries: + - query: | + MATCH (p:participant) + optional match (p)<--(sm:sample) + optional match (p)<--(file) + where (file: clinical_measure_file or file: radiology_file) + with distinct p, sm, file + with p, collect(DISTINCT { + sample_anatomic_site: apoc.text.split(sm.anatomic_site, ';'), + participant_age_at_collection: sm.participant_age_at_collection, + sample_tumor_status: sm.sample_tumor_status, + tumor_classification: 
sm.tumor_classification, + assay_method: CASE labels(file)[0] WHEN 'clinical_measure_file' THEN 'Clinical data' + WHEN 'radiology_file' THEN 'Radiology imaging' + ELSE null END, + file_type: file.file_type, + library_source_material: null, + library_source_molecule: null, + library_strategy: null + }) as sample_clinical_radiology_file_filter + optional match (p)<--(sm:sample)<--(file) + where (file: sequencing_file or file: methylation_array_file or file: pathology_file or file: cytogenomic_file) + with p, sample_clinical_radiology_file_filter, collect(DISTINCT { + sample_anatomic_site: apoc.text.split(sm.anatomic_site, ';'), + participant_age_at_collection: sm.participant_age_at_collection, + sample_tumor_status: sm.sample_tumor_status, + tumor_classification: sm.tumor_classification, + assay_method: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN 'Sequencing' + WHEN 'cytogenomic_file' THEN 'Cytogenomic' + WHEN 'pathology_file' THEN 'Pathology imaging' + WHEN 'methylation_array_file' THEN 'Methylation array' + ELSE null END, + file_type: file.file_type, + library_selection: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_selection + ELSE null END, + library_source_material: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_material + ELSE null END, + library_source_molecule: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_molecule + ELSE null END, + library_strategy: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_strategy + ELSE null END + }) as sample_sequencing_cytogenomic_pathology_methylation_file_filter + with p, apoc.coll.union(sample_clinical_radiology_file_filter, sample_sequencing_cytogenomic_pathology_methylation_file_filter) as sample_file_filters + optional match (p)<--(dg:diagnosis) + with p, sample_file_filters, dg + unwind sample_file_filters as sample_file_filter + with p, collect(apoc.map.merge(sample_file_filter, { + age_at_diagnosis: dg.age_at_diagnosis, + 
diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis + })) as sample_diagnosis_file_filter + optional match (p)<--(sm:sample)<--(dg:diagnosis) + optional match (sm)<--(file) + where (file: sequencing_file or file: methylation_array_file or file: pathology_file or file: cytogenomic_file) + with p, sample_diagnosis_file_filter, COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm.anatomic_site, ';'), + participant_age_at_collection: sm.participant_age_at_collection, + sample_tumor_status: sm.sample_tumor_status, + tumor_classification: sm.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN 'Sequencing' + WHEN 'cytogenomic_file' THEN 'Cytogenomic' + WHEN 'pathology_file' THEN 'Pathology imaging' + WHEN 'methylation_array_file' THEN 'Methylation array' + ELSE null END, + file_type: file.file_type, + library_selection: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_selection + ELSE null END, + library_source_material: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_material + ELSE null END, + library_source_molecule: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_molecule + ELSE null END, + library_strategy: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_strategy + ELSE null END + }) AS sample_diagnosis_filters_1 + 
with p, apoc.coll.union(sample_diagnosis_file_filter, sample_diagnosis_filters_1) as sample_diagnosis_file_filters + optional match (p)<--(sm:sample)<--(dg:diagnosis) + optional match (p)<--(file) + where (file: clinical_measure_file or file: radiology_file) + with p, sample_diagnosis_file_filters, COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm.anatomic_site, ';'), + participant_age_at_collection: sm.participant_age_at_collection, + sample_tumor_status: sm.sample_tumor_status, + tumor_classification: sm.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE labels(file)[0] WHEN 'clinical_measure_file' THEN 'Clinical data' + WHEN 'radiology_file' THEN 'Radiology imaging' + ELSE null END, + file_type: file.file_type, + library_selection: null, + library_source_material: null, + library_source_molecule: null, + library_strategy: null + }) AS sample_diagnosis_filters_2 + with p, apoc.coll.union(sample_diagnosis_file_filters, sample_diagnosis_filters_2) as sample_diagnosis_file_filter + optional MATCH (p)<-[:of_sample]-(sm1:sample)<--(cl)<--(sm2:sample) + WHERE (cl: cell_line or cl: pdx) + optional Match (sm2)<--(file) + WHERE (file: sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:cytogenomic_file) + optional Match (sm1)<--(dg:diagnosis) + with p, sample_diagnosis_file_filter, COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm1.anatomic_site, ';'), + participant_age_at_collection: sm1.participant_age_at_collection, + sample_tumor_status: sm1.sample_tumor_status, + tumor_classification: sm1.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + 
diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN 'Sequencing' + WHEN 'cytogenomic_file' THEN 'Cytogenomic' + WHEN 'pathology_file' THEN 'Pathology imaging' + WHEN 'methylation_array_file' THEN 'Methylation array' + ELSE null END, + file_type: file.file_type, + library_selection: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_selection + ELSE null END, + library_source_material: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_material + ELSE null END, + library_source_molecule: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_molecule + ELSE null END, + library_strategy: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_strategy + ELSE null END + }) AS sample_diagnosis_filters_1 + with p, apoc.coll.union(sample_diagnosis_file_filter, sample_diagnosis_filters_1) as sample_diagnosis_file_filters + optional MATCH (p)<-[:of_sample]-(sm1:sample)<--(cl)<--(sm2:sample) + WHERE (cl: cell_line or cl: pdx) + optional Match (sm2)<--(file) + WHERE (file: sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:cytogenomic_file) + optional Match (sm2)<--(dg:diagnosis) + with p, sample_diagnosis_file_filters, COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm2.anatomic_site, ';'), + participant_age_at_collection: sm2.participant_age_at_collection, + sample_tumor_status: sm2.sample_tumor_status, + tumor_classification: sm2.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: 
dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN 'Sequencing' + WHEN 'cytogenomic_file' THEN 'Cytogenomic' + WHEN 'pathology_file' THEN 'Pathology imaging' + WHEN 'methylation_array_file' THEN 'Methylation array' + ELSE null END, + file_type: file.file_type, + library_selection: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_selection + ELSE null END, + library_source_material: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_material + ELSE null END, + library_source_molecule: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_molecule + ELSE null END, + library_strategy: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_strategy + ELSE null END + }) AS sample_diagnosis_filters_2 + with p, apoc.coll.union(sample_diagnosis_file_filters, sample_diagnosis_filters_2) as sample_diagnosis_file_filter + optional MATCH (p)<-[:of_sample]-(sm1:sample)<--(cl)<--(sm2:sample) + WHERE (cl: cell_line or cl: pdx) + optional Match (sm1)<--(dg:diagnosis) + optional match (p)<--(file) + where (file: clinical_measure_file or file: radiology_file) + with p, sample_diagnosis_file_filter, COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm1.anatomic_site, ';'), + participant_age_at_collection: sm1.participant_age_at_collection, + sample_tumor_status: sm1.sample_tumor_status, + tumor_classification: sm1.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + 
assay_method: CASE labels(file)[0] WHEN 'clinical_measure_file' THEN 'Clinical data' + WHEN 'radiology_file' THEN 'Radiology imaging' + ELSE null END, + file_type: file.file_type, + library_selection: null, + library_source_material: null, + library_source_molecule: null, + library_strategy: null + }) AS sample_diagnosis_filters_3 + with p, apoc.coll.union(sample_diagnosis_file_filter, sample_diagnosis_filters_3) as sample_diagnosis_file_filters + optional MATCH (p)<-[:of_sample]-(sm1:sample)<--(cl)<--(sm2:sample) + WHERE (cl: cell_line or cl: pdx) + optional Match (sm2)<--(dg:diagnosis) + optional match (p)<--(file) + where (file: clinical_measure_file or file: radiology_file) + with p, sample_diagnosis_file_filters, COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm2.anatomic_site, ';'), + participant_age_at_collection: sm2.participant_age_at_collection, + sample_tumor_status: sm2.sample_tumor_status, + tumor_classification: sm2.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE labels(file)[0] WHEN 'clinical_measure_file' THEN 'Clinical data' + WHEN 'radiology_file' THEN 'Radiology imaging' + ELSE null END, + file_type: file.file_type, + library_selection: null, + library_source_material: null, + library_source_molecule: null, + library_strategy: null + }) AS sample_diagnosis_filters_4 + with p, apoc.coll.union(sample_diagnosis_file_filters, sample_diagnosis_filters_4) as sample_diagnosis_file_filter + OPTIONAL MATCH (p)<-[*..4]-(file) + WHERE (file:clinical_measure_file OR file: sequencing_file OR file:pathology_file OR file:radiology_file OR file:methylation_array_file OR file:cytogenomic_file) 
+ OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) + OPTIONAL MATCH (p)<-[:of_treatment]-(tm:treatment) + OPTIONAL MATCH (p)<-[:of_treatment_response]-(tr:treatment_response) + OPTIONAL MATCH (p)<-[:of_synonym]-(sy:synonym) + OPTIONAL MATCH (st:study)<-[:of_participant]-(p) + OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) + OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) + WITH tr, p, sy, sample_diagnosis_file_filter, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, + event_free_survival_status: su.event_free_survival_status, + first_event: su.first_event, + age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + COLLECT(DISTINCT{treatment_type: tm.treatment_type, + treatment_agent: tm.treatment_agent, + age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, file, st, stf, stp + RETURN DISTINCT + p.id as id, + p.participant_id as participant_id, + apoc.text.split(p.race, ';') as race, + p.race as race_str, + p.sex_at_birth as sex_at_birth, + apoc.text.join(Collect(distinct sy.synonym_id), ',') as alternate_participant_id, + survival_filters as survival_filters, + tr.response_category as response_category, + tr.age_at_response as age_at_response, + treatment_filters as treatment_filters, + treatment_response_filters as treatment_response_filters, + sample_diagnosis_file_filter AS sample_diagnosis_file_filters, + st.study_id as study_id, + st.dbgap_accession as dbgap_accession, + st.study_acronym as study_acronym, + st.study_name as study_name, + COUNT(DISTINCT file.id) as file_count, + COLLECT(DISTINCT file.id) as files - index_name: diagnosis type: neo4j @@ -1897,22 +2280,15 @@ Indices: OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - WITH p, cell_line_pdx_file_filters, general_file_filters, 
participant_clinical_measure_file_filters,participant_radiology_file_filters, file, COLLECT(DISTINCT CASE - WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' - ELSE su.last_known_survival_status - END) AS last_known_survival_status, - COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, - COLLECT(DISTINCT su.first_event) as first_event, - COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, - COLLECT(DISTINCT{treatment_type: tm.treatment_type, + WITH p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters,participant_radiology_file_filters, file, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, + event_free_survival_status: su.event_free_survival_status, + first_event: su.first_event, + age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + COLLECT(DISTINCT{treatment_type: tm.treatment_type, treatment_agent: tm.treatment_agent, age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, COLLECT(DISTINCT{response_category: tr.response_category, age_at_response: tr.age_at_response}) as treatment_response_filters, st, stf, stp, dg - WITH p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters,participant_radiology_file_filters, file, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, - event_free_survival_status: event_free_survival_status, - first_event: first_event, - age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, st, stf, stp, dg, treatment_filters,treatment_response_filters RETURN DISTINCT dg.id as id, p.id as pid, @@ -2011,22 +2387,15 @@ Indices: OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - WITH dg, p, sm, sample_file_filter, file, 
COLLECT(DISTINCT CASE - WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' - ELSE su.last_known_survival_status - END) AS last_known_survival_status, - COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, - COLLECT(DISTINCT su.first_event) as first_event, - COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, + WITH dg, p, sm, sample_file_filter, file, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, + event_free_survival_status: su.event_free_survival_status, + first_event: su.first_event, + age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, COLLECT(DISTINCT{treatment_type: tm.treatment_type, treatment_agent: tm.treatment_agent, age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, COLLECT(DISTINCT{response_category: tr.response_category, - age_at_response: tr.age_at_response}) as treatment_response_filters, st, stf, stp - WITH dg, p, sm, sample_file_filter, file, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, - event_free_survival_status: event_free_survival_status, - first_event: first_event, - age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, st, stf, stp,treatment_filters,treatment_response_filters + age_at_response: tr.age_at_response}) as treatment_response_filters, st, stf, stp RETURN DISTINCT dg.id as id, p.id as pid, @@ -2105,22 +2474,15 @@ Indices: OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - WITH dg, p, sid, sample_id, sample_file_filter, files, COLLECT(DISTINCT CASE - WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' - ELSE su.last_known_survival_status - END) AS last_known_survival_status, - COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, - 
COLLECT(DISTINCT su.first_event) as first_event, - COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, + WITH dg, p, sid, sample_id, sample_file_filter, files, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, + event_free_survival_status: su.event_free_survival_status, + first_event: su.first_event, + age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, COLLECT(DISTINCT{treatment_type: tm.treatment_type, treatment_agent: tm.treatment_agent, age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, COLLECT(DISTINCT{response_category: tr.response_category, - age_at_response: tr.age_at_response}) as treatment_response_filters, st, stf, stp - WITH dg, p, sid, sample_id, sample_file_filter, files, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, - event_free_survival_status: event_free_survival_status, - first_event: first_event, - age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, st, stf, stp,treatment_filters,treatment_response_filters + age_at_response: tr.age_at_response}) as treatment_response_filters, st, stf, stp RETURN DISTINCT dg.id as id, p.id as pid, @@ -2525,20 +2887,26 @@ Indices: direct_file_count: COUNT(DISTINCT direct_file.id) }) AS opensearch_data OPTIONAL MATCH (sm)-[*..3]->(:participant)<-[:of_survival]-(su:survival) - WITH sm, opensearch_data, COLLECT(DISTINCT CASE - WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' - ELSE su.last_known_survival_status - END) AS last_known_survival_status, - COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, - COLLECT(DISTINCT su.first_event) as first_event, - COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status - WITH sm, opensearch_data, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, - event_free_survival_status: 
event_free_survival_status, - first_event: first_event, - age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters + WITH sm, opensearch_data, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, + event_free_survival_status: su.event_free_survival_status, + first_event: su.first_event, + age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters WITH sm, apoc.map.merge(opensearch_data, { survival_filters: survival_filters }) AS opensearch_data + OPTIONAL MATCH (sm)-[*..3]->(:participant)<-[:of_treatment]-(tm:treatment) + WITH sm, opensearch_data, COLLECT(DISTINCT{treatment_type: tm.treatment_type, + treatment_agent: tm.treatment_agent, + age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters + WITH sm, apoc.map.merge(opensearch_data, { + treatment_filters: treatment_filters + }) AS opensearch_data + OPTIONAL MATCH (sm)-[*..3]->(:participant)<-[:of_treatment_response]-(tr:treatment_response) + WITH sm, opensearch_data, COLLECT(DISTINCT{response_category: tr.response_category, + age_at_response: tr.age_at_response}) as treatment_response_filters + WITH sm, apoc.map.merge(opensearch_data, { + treatment_response_filters: treatment_response_filters + }) AS opensearch_data return opensearch_data page_size: 500 - query: | @@ -2578,6 +2946,8 @@ Indices: diagnosis: dg.diagnosis }) AS diagnosis_filters, null AS survival_filters, + null as treatment_filters. 
+ null as treatment_response_filters, CASE COLLECT(file) WHEN [] THEN [] ELSE COLLECT(DISTINCT { assay_method: CASE LABELS(file)[0] @@ -2737,6 +3107,22 @@ Indices: type: keyword first_event: type: keyword + treatment_filters: + type: nested + properties: + treatment_type: + type: keyword + treatment_agent: + type: keyword + age_at_treatment_start: + type: integer + treatment_response_filters: + type: nested + properties: + response_category: + type: keyword + age_at_response: + type: integer library_selection: type: keyword fields: @@ -2854,20 +3240,19 @@ Indices: MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_publication]-(pub:publication) OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) + OPTIONAL MATCH (p)<-[:of_treatment]-(tm:treatment) + OPTIONAL MATCH (p)<-[:of_treatment_response]-(tr:treatment_response) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - with file, sample_diagnosis_filter,COLLECT(DISTINCT CASE - WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' - ELSE su.last_known_survival_status - END) AS last_known_survival_status, - COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, - COLLECT(DISTINCT su.first_event) as first_event, - COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status - with file, sample_diagnosis_filter,COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, - event_free_survival_status: event_free_survival_status, - first_event: first_event, - age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters - + with file, sample_diagnosis_filter,COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, + event_free_survival_status: su.event_free_survival_status, + first_event: su.first_event, + age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + 
COLLECT(DISTINCT{treatment_type: tm.treatment_type, + treatment_agent: tm.treatment_agent, + age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, + COLLECT(DISTINCT{response_category: tr.response_category, + age_at_response: tr.age_at_response}) as treatment_response_filters RETURN DISTINCT file.id as id, p.id as pid, @@ -2964,6 +3349,8 @@ Indices: optional MATCH (p)<-[:of_sample]-(sm1:sample)<-[*2..2]-(sm:sample) OPTIONAL MATCH (sm)<--(dg:diagnosis) OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) + OPTIONAL MATCH (p)<-[:of_treatment]-(tm:treatment) + OPTIONAL MATCH (p)<-[:of_treatment_response]-(tr:treatment_response) with file, p, sample_diagnosis_filter_5, apoc.coll.union(COLLECT(DISTINCT { sample_anatomic_site: apoc.text.split(sm1.anatomic_site, ';'), participant_age_at_collection: sm1.participant_age_at_collection, @@ -2990,23 +3377,23 @@ Indices: tumor_grade_source: dg.tumor_grade_source, tumor_stage_source: dg.tumor_stage_source, diagnosis: dg.diagnosis - })) AS sample_diagnosis_filter_6, COLLECT(DISTINCT CASE - WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' - ELSE su.last_known_survival_status - END) AS last_known_survival_status, - COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, - COLLECT(DISTINCT su.first_event) as first_event, - COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status - with file, p, apoc.coll.union(sample_diagnosis_filter_5, sample_diagnosis_filter_6) as sample_diagnosis_filter, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, - event_free_survival_status: event_free_survival_status, - first_event: first_event, - age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters + })) AS sample_diagnosis_filter_6, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, + event_free_survival_status: su.event_free_survival_status, + first_event: su.first_event, + 
age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + COLLECT(DISTINCT{treatment_type: tm.treatment_type, + treatment_agent: tm.treatment_agent, + age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, + COLLECT(DISTINCT{response_category: tr.response_category, + age_at_response: tr.age_at_response}) as treatment_response_filters with file, collect(DISTINCT { participant_id: p.participant_id, race: apoc.text.split(p.race, ';'), sex_at_birth: p.sex_at_birth, sample_diagnosis_filters: sample_diagnosis_filter, - survival_filters: survival_filters + survival_filters: survival_filters, + treatment_filters: treatment_filters, + treatment_response_filters: treatment_response_filters }) as combined_filter_1 MATCH (st:study)<-[:of_clinical_measure_file]-(file) OPTIONAL MATCH (st)<--(cl)<--(sm:sample) @@ -3167,19 +3554,19 @@ Indices: MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_publication]-(pub:publication) OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) + OPTIONAL MATCH (p)<-[:of_treatment]-(tm:treatment) + OPTIONAL MATCH (p)<-[:of_treatment_response]-(tr:treatment_response) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - with file, p, sample_diagnosis_filter, sm1, sm, st, COLLECT(DISTINCT CASE - WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' - ELSE su.last_known_survival_status - END) AS last_known_survival_status, - COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, - COLLECT(DISTINCT su.first_event) as first_event, - COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, stf, stp - with file, p, sample_diagnosis_filter, sm1, sm, st, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, - event_free_survival_status: event_free_survival_status, - first_event: first_event, - age_at_event_free_survival_status: 
age_at_event_free_survival_status} ) AS survival_filters, stf, stp + with file, p, sample_diagnosis_filter, sm1, sm, st, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, + event_free_survival_status: su.event_free_survival_status, + first_event: su.first_event, + age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + COLLECT(DISTINCT{treatment_type: tm.treatment_type, + treatment_agent: tm.treatment_agent, + age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, + COLLECT(DISTINCT{response_category: tr.response_category, + age_at_response: tr.age_at_response}) as treatment_response_filters, stf, stp RETURN DISTINCT file.id as id, p.id as pid, @@ -3214,6 +3601,8 @@ Indices: }) AS participant_filters, sample_diagnosis_filter AS sample_diagnosis_filters, survival_filters as survival_filters, + treatment_filters as treatment_filters, + treatment_response_filters as treatment_response_filters, CASE LABELS(file)[0] WHEN 'sequencing_file' THEN file.library_selection ELSE null END AS library_selection, CASE LABELS(file)[0] WHEN 'sequencing_file' THEN file.library_source_material @@ -3277,6 +3666,8 @@ Indices: diagnosis: dg.diagnosis }) AS sample_diagnosis_filters, null as survival_filters, + null as treatment_filters, + null as treatment_response_filters, CASE LABELS(file)[0] WHEN 'sequencing_file' THEN file.library_selection ELSE null END AS library_selection, @@ -3382,19 +3773,19 @@ Indices: with p, sm1, sm, apoc.coll.union(sample_diagnosis_filter_3, sample_diagnosis_filter_4) as sample_diagnosis_filter MATCH (st:study)<--(p)<--(sm1)<-[*2..2]-(sm) OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) + OPTIONAL MATCH (p)<-[:of_treatment]-(tm:treatment) + OPTIONAL MATCH (p)<-[:of_treatment_response]-(tr:treatment_response) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - with distinct p, sm, st, 
sample_diagnosis_filter, COLLECT(DISTINCT CASE - WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' - ELSE su.last_known_survival_status - END) AS last_known_survival_status, - COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, - COLLECT(DISTINCT su.first_event) as first_event, - COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, stf, stp - with distinct p, sm, st, sample_diagnosis_filter, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, - event_free_survival_status: event_free_survival_status, - first_event: first_event, - age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, stf, stp + with distinct p, sm, st, sample_diagnosis_filter, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, + event_free_survival_status: su.event_free_survival_status, + first_event: su.first_event, + age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + COLLECT(DISTINCT{treatment_type: tm.treatment_type, + treatment_agent: tm.treatment_agent, + age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, + COLLECT(DISTINCT{response_category: tr.response_category, + age_at_response: tr.age_at_response}) as treatment_response_filters, stf, stp RETURN DISTINCT null as id, p.id as pid, @@ -3419,6 +3810,8 @@ Indices: sex_at_birth: p.sex_at_birth }) AS participant_filters, survival_filters as survival_filters, + treatment_filters as treatment_filters, + treatment_response_filters as treatment_response_filters, null AS library_selection, null AS library_source_material, null AS library_source_molecule, @@ -3428,19 +3821,19 @@ Indices: where not ((sm)<-[*..3]-(:sequencing_file)) and not ((sm)<-[*..3]-(:cytogenomic_file)) and not ((sm)<-[*..3]-(:pathology_file)) and not ((sm)<-[*..3]-(:methylation_array_file)) and not ((p)<--(:radiology_file)) and not 
((p)<--(:clinical_measure_file)) OPTIONAL MATCH (p)<-[*..2]-(dg:diagnosis) OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) + OPTIONAL MATCH (p)<-[:of_treatment]-(tm:treatment) + OPTIONAL MATCH (p)<-[:of_treatment_response]-(tr:treatment_response) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - with sm, p, st, dg, COLLECT(DISTINCT CASE - WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' - ELSE su.last_known_survival_status - END) AS last_known_survival_status, - COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, - COLLECT(DISTINCT su.first_event) as first_event, - COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, stf, stp - with sm, p, st, dg, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, - event_free_survival_status: event_free_survival_status, - first_event: first_event, - age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, stf, stp + with sm, p, st, dg, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, + event_free_survival_status: su.event_free_survival_status, + first_event: su.first_event, + age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + COLLECT(DISTINCT{treatment_type: tm.treatment_type, + treatment_agent: tm.treatment_agent, + age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, + COLLECT(DISTINCT{response_category: tr.response_category, + age_at_response: tr.age_at_response}) as treatment_response_filters, stf, stp RETURN DISTINCT null as id, p.id as pid, @@ -3479,6 +3872,8 @@ Indices: diagnosis: dg.diagnosis }) AS sample_diagnosis_filters, survival_filters as survival_filters, + treatment_filters as treatment_filters, + treatment_response_filters as treatment_response_filters, null AS library_selection, null AS library_source_material, 
null AS library_source_molecule, @@ -3511,6 +3906,8 @@ Indices: null as combined_filters, null AS participant_filters, null as survival_filters, + null as treatment_filters, + null as treatment_response_filters, COLLECT(DISTINCT { sample_anatomic_site: apoc.text.split(sm.anatomic_site, ';'), participant_age_at_collection: sm.participant_age_at_collection, @@ -3534,19 +3931,19 @@ Indices: where not ((p)<--(:sample)) and not ((p)<--(:radiology_file)) and not ((p)<--(:clinical_measure_file)) OPTIONAL MATCH (p)<-[:of_diagnosis]-(dg:diagnosis) OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) + OPTIONAL MATCH (p)<-[:of_treatment]-(tm:treatment) + OPTIONAL MATCH (p)<-[:of_treatment_response]-(tr:treatment_response) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - with p, st, dg, COLLECT(DISTINCT CASE - WHEN su.last_known_survival_status = 'Dead' THEN 'Dead' - ELSE su.last_known_survival_status - END) AS last_known_survival_status, - COLLECT(DISTINCT su.event_free_survival_status) as event_free_survival_status, - COLLECT(DISTINCT su.first_event) as first_event, - COLLECT(DISTINCT su.age_at_event_free_survival_status) as age_at_event_free_survival_status, stf, stp - with p, st, dg, COLLECT(DISTINCT {last_known_survival_status: last_known_survival_status, - event_free_survival_status: event_free_survival_status, - first_event: first_event, - age_at_event_free_survival_status: age_at_event_free_survival_status} ) AS survival_filters, stf, stp + with p, st, dg, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, + event_free_survival_status: su.event_free_survival_status, + first_event: su.first_event, + age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + COLLECT(DISTINCT{treatment_type: tm.treatment_type, + treatment_agent: tm.treatment_agent, + age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, + 
COLLECT(DISTINCT{response_category: tr.response_category, + age_at_response: tr.age_at_response}) as treatment_response_filters, stf, stp RETURN DISTINCT null as id, p.id as pid, @@ -3585,6 +3982,8 @@ Indices: diagnosis_classification: dg.diagnosis_classification }) AS sample_diagnosis_filters, survival_filters as survival_filters, + treatment_filters as treatment_filters, + treatment_response_filters as treatment_response_filters, null AS library_selection, null AS library_source_material, null AS library_source_molecule, From 36d9cab06b6fa56d51d9fe7984e19b276a17d330 Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Wed, 16 Oct 2024 13:03:45 -0400 Subject: [PATCH 12/23] fix treatments and treament_response indices --- config/es_indices_ccdi_model.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index 25fbf23..39c43a7 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -1645,7 +1645,6 @@ Indices: tm.treatment_type as treatment_type, tm.treatment_agent as treatment_agent, tm.age_at_treatment_start as age_at_treatment_start, - treatment_filters as treatment_filters, treatment_response_filters as treatment_response_filters, sample_diagnosis_file_filter AS sample_diagnosis_file_filters, st.study_id as study_id, @@ -2031,7 +2030,6 @@ Indices: tr.response_category as response_category, tr.age_at_response as age_at_response, treatment_filters as treatment_filters, - treatment_response_filters as treatment_response_filters, sample_diagnosis_file_filter AS sample_diagnosis_file_filters, st.study_id as study_id, st.dbgap_accession as dbgap_accession, From 4726022cb767804b141ff02bcd45a1c4809fa99a Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Wed, 16 Oct 2024 14:18:13 -0400 Subject: [PATCH 13/23] fix treatments and treament_response indices --- 
config/es_indices_ccdi_model.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index 39c43a7..e61086f 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -2272,9 +2272,9 @@ Indices: OPTIONAL MATCH (p)<-[*..4]-(file) WHERE (file:clinical_measure_file OR file: sequencing_file OR file:pathology_file OR file:radiology_file OR file:methylation_array_file OR file:cytogenomic_file) OPTIONAL MATCH (p)<-[:of_survival]-(su:survival) - OPTIONAL MATCH (p)<-[:of_treatment]-(tm:treatment) - OPTIONAL MATCH (p)<-[:of_treatment_response]-(tr:treatment_response) - with p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters,participant_radiology_file_filters, dg, file, su. tm, tr + OPTIONAL MATCH (p)<-[:of_treatment]-(tm:treatment) + OPTIONAL MATCH (p)<-[:of_treatment_response]-(tr:treatment_response) + with p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters,participant_radiology_file_filters, dg, file, su, tm, tr OPTIONAL MATCH (st:study)<-[:of_participant]-(p) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) @@ -3242,7 +3242,7 @@ Indices: OPTIONAL MATCH (p)<-[:of_treatment_response]-(tr:treatment_response) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) - with file, sample_diagnosis_filter,COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, + with file, p, st, sample_diagnosis_filter,COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, event_free_survival_status: su.event_free_survival_status, first_event: su.first_event, age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, From 383c4bdca1897f5648d04b515d4b09902ffc85e8 Mon Sep 
17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Wed, 16 Oct 2024 14:27:59 -0400 Subject: [PATCH 14/23] fix treatments and treament_response indices --- config/es_indices_ccdi_model.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index e61086f..83b6eba 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -3384,6 +3384,7 @@ Indices: age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, COLLECT(DISTINCT{response_category: tr.response_category, age_at_response: tr.age_at_response}) as treatment_response_filters + with file, p, apoc.coll.union(sample_diagnosis_filter_5, sample_diagnosis_filter_6) as sample_diagnosis_filter, survival_filters, treatment_filters, treatment_response_filters with file, collect(DISTINCT { participant_id: p.participant_id, race: apoc.text.split(p.race, ';'), From 1214adf9e2ad33c56fc257aaf4b11a451bbc447e Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Wed, 16 Oct 2024 14:40:55 -0400 Subject: [PATCH 15/23] fix treatments and treament_response indices --- config/es_indices_ccdi_model.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index 83b6eba..da6bc41 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -3416,8 +3416,7 @@ Indices: participant_id: null, race: null, sex_at_birth: null, - sample_diagnosis_filters: sample_diagnosis_filter, - survival_filters: survival_filters + sample_diagnosis_filters: sample_diagnosis_filter }) as combined_filter_2 with file, apoc.coll.union(combined_filter_1, combined_filter_2) as combined_filter MATCH (st:study)<-[:of_clinical_measure_file]-(file) From 2769d28483f5710e5863c783192005aa7890c91f Mon Sep 17 00:00:00 2001 From: shawnwangnih 
<108429233+shawnwangnih@users.noreply.github.com> Date: Wed, 16 Oct 2024 16:21:30 -0400 Subject: [PATCH 16/23] fix treatments and treament_response indices --- config/es_indices_ccdi_model.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index da6bc41..7708437 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -413,7 +413,7 @@ Indices: last_known_survival_status: null, age_at_event_free_survival_status: null, event_free_survival_status: null, - first_event: null, + first_event: null }) AS survival_filters, COLLECT(DISTINCT{treatment_type: null, treatment_agent: null, @@ -444,7 +444,7 @@ Indices: null as last_known_survival_status, null as age_at_event_free_survival_status, null as event_free_survival_status, - null as first_event, + null as first_event }) AS survival_filters, COLLECT(DISTINCT{null as treatment_type, tnull as treatment_agent, @@ -2572,7 +2572,7 @@ Indices: last_known_survival_status: null, age_at_event_free_survival_status: null, event_free_survival_status: null, - first_event: null, + first_event: null }) AS survival_filters, COLLECT(DISTINCT{treatment_type: null, treatment_agent: null, @@ -2944,7 +2944,7 @@ Indices: diagnosis: dg.diagnosis }) AS diagnosis_filters, null AS survival_filters, - null as treatment_filters. 
+ null as treatment_filters, null as treatment_response_filters, CASE COLLECT(file) WHEN [] THEN [] ELSE COLLECT(DISTINCT { From 8a98aef766fa04d57ba718ba136523d42fe05dbd Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Wed, 16 Oct 2024 16:25:38 -0400 Subject: [PATCH 17/23] fix treatments and treament_response indices --- config/es_indices_ccdi_model.yml | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index 7708437..1bc02c0 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -3048,18 +3048,6 @@ Indices: type: keyword diagnosis: type: keyword - survival_filters: - type: nested - properties: - last_known_survival_status: - type: keyword - age_at_event_free_survival_status: - type: integer - event_free_survival_status: - type: keyword - first_event: - type: keyword - participant_filters: type: nested properties: @@ -3677,7 +3665,7 @@ Indices: WHEN 'sequencing_file' THEN file.library_strategy ELSE null END AS library_strategy - query: | - MATCH (st:study)<--(p:participant)<--(sm1:sample)<-[*2..2]-(sm:sample) + MATCH (st:study)<--(p:participant)<--(sm1:sample)<-[*2..2]-(sm:sample) where not ((sm)<--(:sequencing_file)) and not ((sm)<--(:cytogenomic_file)) and not ((sm)<--(:pathology_file)) and not ((sm)<--(:methylation_array_file)) and not ((p)<--(:radiology_file)) and not ((p)<--(:clinical_measure_file)) OPTIONAL MATCH (p)<-[:of_diagnosis]-(dg:diagnosis) with p, sm1, sm, apoc.coll.union(COLLECT(DISTINCT { @@ -3846,7 +3834,7 @@ Indices: st.study_id AS study_id, st.dbgap_accession as dbgap_accession, st.study_acronym as study_acronym, - st.study_name as study_name, + st.study_name as study_name, p.participant_id AS participant_id, sm.sample_id AS sample_id, null as files, From b3eec61fceac2337c2f3b5490e35f2563e1d54f6 Mon Sep 17 00:00:00 2001 From: shawnwangnih 
<108429233+shawnwangnih@users.noreply.github.com> Date: Wed, 16 Oct 2024 17:47:27 -0400 Subject: [PATCH 18/23] fix treatments and treament_response indices --- config/es_indices_ccdi_model.yml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index 1bc02c0..434a9a5 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -3021,6 +3021,33 @@ Indices: type: keyword sex_at_birth: type: keyword + survival_filters: + type: nested + properties: + last_known_survival_status: + type: keyword + age_at_event_free_survival_status: + type: integer + event_free_survival_status: + type: keyword + first_event: + type: keyword + treatment_filters: + type: nested + properties: + treatment_type: + type: keyword + treatment_agent: + type: keyword + age_at_treatment_start: + type: integer + treatment_response_filters: + type: nested + properties: + response_category: + type: keyword + age_at_response: + type: integer sample_diagnosis_filters: type: nested properties: @@ -3269,6 +3296,8 @@ Indices: }) AS participant_filters, sample_diagnosis_filter AS sample_diagnosis_filters, survival_filters as survival_filters, + treatment_filters as treatment_filters, + treatment_response_filters as treatment_response_filters, null AS library_selection, null AS library_source_material, null AS library_source_molecule, @@ -3434,6 +3463,8 @@ Indices: null as participant_filters, null as sample_diagnosis_filters, null as survival_filters, + null as treatment_filters, + null as treatment_response_filters, null AS library_selection, null AS library_source_material, null AS library_source_molecule, From 8a0271a7d10cdda8b3d99c3cf7acc795d27221bd Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Thu, 17 Oct 2024 10:52:36 -0400 Subject: [PATCH 19/23] fix study_participant indice --- config/es_indices_ccdi_model.yml | 124 
++++++++++++++++--------------- 1 file changed, 65 insertions(+), 59 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index 434a9a5..ff5ad9d 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -427,69 +427,75 @@ Indices: - query: | MATCH (st:study) MATCH (st)<-[:of_cell_line|of_pdx]-(cl)<--(sm:sample) - Where (cl: cell_line or cl: pdx) - optional Match (sm)<--(dg:diagnosis) - optional Match (sm)<--(file) - WHERE (file: sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:cytogenomic_file) + WHERE (cl:cell_line OR cl:pdx) + OPTIONAL MATCH (sm)<--(dg:diagnosis) + OPTIONAL MATCH (sm)<--(file) + WHERE (file:sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:cytogenomic_file) OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel) OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding) WITH file, dg, sm, st, stf, stp RETURN DISTINCT - null as id, - null as pid, - null as participant_id, - null as race, - null as sex_at_birth, - COLLECT(DISTINCT { - null as last_known_survival_status, - null as age_at_event_free_survival_status, - null as event_free_survival_status, - null as first_event - }) AS survival_filters, - COLLECT(DISTINCT{null as treatment_type, - tnull as treatment_agent, - null as age_at_treatment_start}) as treatment_filters, - COLLECT(DISTINCT{null as response_category, - null as age_at_response}) as treatment_response_filters - COLLECT(DISTINCT { - sample_anatomic_site: apoc.text.split(sm.anatomic_site, ';'), - participant_age_at_collection: sm.participant_age_at_collection, - sample_tumor_status: sm.sample_tumor_status, - tumor_classification: sm.tumor_classification, - age_at_diagnosis: dg.age_at_diagnosis, - diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), - disease_phase: dg.disease_phase, - diagnosis_classification_system: dg.diagnosis_classification_system, - diagnosis_basis: 
dg.diagnosis_basis, - tumor_grade_source: dg.tumor_grade_source, - tumor_stage_source: dg.tumor_stage_source, - diagnosis: dg.diagnosis, - assay_method: CASE LABELS(file)[0] - WHEN 'sequencing_file' THEN 'Sequencing' - WHEN 'cytogenomic_file' THEN 'Cytogenomic' - WHEN 'pathology_file' THEN 'Pathology imaging' - WHEN 'methylation_array_file' THEN 'Methylation array' - ELSE null END, - file_type: CASE LABELS(file)[0] - When null then null - ELSE file.file_type end, - library_selection: CASE LABELS(file)[0] - WHEN 'sequencing_file' THEN file.library_selection - ELSE null END, - library_source_material: CASE LABELS(file)[0] - WHEN 'sequencing_file' THEN file.library_source_material - ELSE null END, - library_source_molecule: CASE LABELS(file)[0] - WHEN 'sequencing_file' THEN file.library_source_molecule - ELSE null END, - library_strategy: CASE LABELS(file)[0] - WHEN 'sequencing_file' THEN file.library_strategy - ELSE null END - }) AS sample_diagnosis_file_filters, - st.study_id as study_id, - st.dbgap_accession as dbgap_accession, - st.study_acronym as study_acronym, - st.study_name as study_name + null AS id, + null AS pid, + null AS participant_id, + null AS race, + null AS sex_at_birth, + COLLECT(DISTINCT { + last_known_survival_status: null, + age_at_event_free_survival_status: null, + event_free_survival_status: null, + first_event: null + }) AS survival_filters, + COLLECT(DISTINCT { + treatment_type: null, + treatment_agent: null, + age_at_treatment_start: null + }) AS treatment_filters, + COLLECT(DISTINCT { + response_category: null, + age_at_response: null + }) AS treatment_response_filters, + COLLECT(DISTINCT { + sample_anatomic_site: apoc.text.split(sm.anatomic_site, ';'), + participant_age_at_collection: sm.participant_age_at_collection, + sample_tumor_status: sm.sample_tumor_status, + tumor_classification: sm.tumor_classification, + age_at_diagnosis: dg.age_at_diagnosis, + diagnosis_anatomic_site: apoc.text.split(dg.anatomic_site, ';'), + disease_phase: 
dg.disease_phase, + diagnosis_classification_system: dg.diagnosis_classification_system, + diagnosis_basis: dg.diagnosis_basis, + tumor_grade_source: dg.tumor_grade_source, + tumor_stage_source: dg.tumor_stage_source, + diagnosis: dg.diagnosis, + assay_method: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN 'Sequencing' + WHEN 'cytogenomic_file' THEN 'Cytogenomic' + WHEN 'pathology_file' THEN 'Pathology imaging' + WHEN 'methylation_array_file' THEN 'Methylation array' + ELSE null END, + file_type: CASE LABELS(file)[0] + WHEN null THEN null + ELSE file.file_type END, + library_selection: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_selection + ELSE null END, + library_source_material: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_material + ELSE null END, + library_source_molecule: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_source_molecule + ELSE null END, + library_strategy: CASE LABELS(file)[0] + WHEN 'sequencing_file' THEN file.library_strategy + ELSE null END + }) AS sample_diagnosis_file_filters, + st.study_id AS study_id, + st.dbgap_accession AS dbgap_accession, + st.study_acronym AS study_acronym, + st.study_name AS study_name + + - index_name: participants From 2f72a5aa3267a17db6fe114b494d51bd18c026ef Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Thu, 17 Oct 2024 12:39:11 -0400 Subject: [PATCH 20/23] add sample_diagnosis_filter back --- config/es_indices_ccdi_model.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index ff5ad9d..2b8b82a 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -3832,6 +3832,7 @@ Indices: race: apoc.text.split(p.race, ';'), sex_at_birth: p.sex_at_birth }) AS participant_filters, + sample_diagnosis_filter as sample_diagnosis_filter, survival_filters as survival_filters, treatment_filters as 
treatment_filters, treatment_response_filters as treatment_response_filters, From 7fcea228bfcafaeca743e1a4ed54afd720361f28 Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Thu, 17 Oct 2024 15:13:39 -0400 Subject: [PATCH 21/23] fix spelling --- config/es_indices_ccdi_model.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index 2b8b82a..dde3ecc 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -3832,7 +3832,7 @@ Indices: race: apoc.text.split(p.race, ';'), sex_at_birth: p.sex_at_birth }) AS participant_filters, - sample_diagnosis_filter as sample_diagnosis_filter, + sample_diagnosis_filter as sample_diagnosis_filters, survival_filters as survival_filters, treatment_filters as treatment_filters, treatment_response_filters as treatment_response_filters, From a397ea05940f1a20330095165f7aafb66f4c9f4b Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:25:57 -0400 Subject: [PATCH 22/23] update id --- config/es_indices_ccdi_model.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index dde3ecc..4e5031d 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -1256,7 +1256,8 @@ Indices: COLLECT(DISTINCT{response_category: tr.response_category, age_at_response: tr.age_at_response}) as treatment_response_filters, file, st, stf, stp RETURN DISTINCT - p.id as id, + su.id as id, + p.id as pid, p.participant_id as participant_id, apoc.text.split(p.race, ';') as race, p.race as race_str, @@ -1641,7 +1642,7 @@ Indices: COLLECT(DISTINCT{response_category: tr.response_category, age_at_response: tr.age_at_response}) as treatment_response_filters, file, st, stf, stp RETURN DISTINCT - p.id as id, + tm.id as id, 
p.participant_id as participant_id, apoc.text.split(p.race, ';') as race, p.race as race_str, @@ -2026,7 +2027,7 @@ Indices: treatment_agent: tm.treatment_agent, age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, file, st, stf, stp RETURN DISTINCT - p.id as id, + tr.id as id, p.participant_id as participant_id, apoc.text.split(p.race, ';') as race, p.race as race_str, From 934065e357b00a6070c699040d6df8a3b42698c3 Mon Sep 17 00:00:00 2001 From: shawnwangnih <108429233+shawnwangnih@users.noreply.github.com> Date: Fri, 18 Oct 2024 16:53:36 -0400 Subject: [PATCH 23/23] fix age_at_last_known_survival_status --- config/es_indices_ccdi_model.yml | 54 ++++++++++++++++---------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/config/es_indices_ccdi_model.yml b/config/es_indices_ccdi_model.yml index 4e5031d..c5f474f 100644 --- a/config/es_indices_ccdi_model.yml +++ b/config/es_indices_ccdi_model.yml @@ -18,7 +18,7 @@ Indices: properties: last_known_survival_status: type: keyword - age_at_event_free_survival_status: + age_at_last_known_survival_status: type: integer event_free_survival_status: type: keyword @@ -357,7 +357,7 @@ Indices: COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, event_free_survival_status: su.event_free_survival_status, first_event: su.first_event, - age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + age_at_last_known_survival_status: su.age_at_last_known_survival_status} ) AS survival_filters, COLLECT(DISTINCT{treatment_type: tm.treatment_type, treatment_agent: tm.treatment_agent, age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, @@ -411,7 +411,7 @@ Indices: }) AS sample_diagnosis_file_filters, COLLECT(DISTINCT { last_known_survival_status: null, - age_at_event_free_survival_status: null, + age_at_last_known_survival_status: null, event_free_survival_status: null, first_event: null }) AS survival_filters, @@ 
-442,7 +442,7 @@ Indices: null AS sex_at_birth, COLLECT(DISTINCT { last_known_survival_status: null, - age_at_event_free_survival_status: null, + age_at_last_known_survival_status: null, event_free_survival_status: null, first_event: null }) AS survival_filters, @@ -519,7 +519,7 @@ Indices: properties: last_known_survival_status: type: keyword - age_at_event_free_survival_status: + age_at_last_known_survival_status: type: integer event_free_survival_status: type: keyword @@ -867,7 +867,7 @@ Indices: COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, event_free_survival_status: su.event_free_survival_status, first_event: su.first_event, - age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + age_at_last_known_survival_status: su.age_at_last_known_survival_status} ) AS survival_filters, COLLECT(DISTINCT{treatment_type: tm.treatment_type, treatment_agent: tm.treatment_agent, age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, @@ -905,7 +905,7 @@ Indices: type: keyword last_known_survival_status: type: keyword - age_at_event_free_survival_status: + age_at_last_known_survival_status: type: integer event_free_survival_status: type: keyword @@ -1264,7 +1264,7 @@ Indices: p.sex_at_birth as sex_at_birth, apoc.text.join(Collect(distinct sy.synonym_id), ',') as alternate_participant_id, su.last_known_survival_status as last_known_survival_status, - su.age_at_event_free_survival_status as age_at_event_free_survival_status, + su.age_at_last_known_survival_status as age_at_last_known_survival_status, su.event_free_survival_status as event_free_survival_status, su.first_event as first_event, treatment_filters as treatment_filters, @@ -1294,7 +1294,7 @@ Indices: properties: last_known_survival_status: type: keyword - age_at_event_free_survival_status: + age_at_last_known_survival_status: type: integer event_free_survival_status: type: keyword @@ -1638,7 +1638,7 @@ Indices: WITH tm, p, sy, 
sample_diagnosis_file_filter, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, event_free_survival_status: su.event_free_survival_status, first_event: su.first_event, - age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + age_at_last_known_survival_status: su.age_at_last_known_survival_status} ) AS survival_filters, COLLECT(DISTINCT{response_category: tr.response_category, age_at_response: tr.age_at_response}) as treatment_response_filters, file, st, stf, stp RETURN DISTINCT @@ -1678,7 +1678,7 @@ Indices: properties: last_known_survival_status: type: keyword - age_at_event_free_survival_status: + age_at_last_known_survival_status: type: integer event_free_survival_status: type: keyword @@ -2022,7 +2022,7 @@ Indices: WITH tr, p, sy, sample_diagnosis_file_filter, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, event_free_survival_status: su.event_free_survival_status, first_event: su.first_event, - age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + age_at_last_known_survival_status: su.age_at_last_known_survival_status} ) AS survival_filters, COLLECT(DISTINCT{treatment_type: tm.treatment_type, treatment_agent: tm.treatment_agent, age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, file, st, stf, stp @@ -2095,7 +2095,7 @@ Indices: properties: last_known_survival_status: type: keyword - age_at_event_free_survival_status: + age_at_last_known_survival_status: type: integer event_free_survival_status: type: keyword @@ -2288,7 +2288,7 @@ Indices: WITH p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters,participant_radiology_file_filters, file, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, event_free_survival_status: su.event_free_survival_status, first_event: su.first_event, - age_at_event_free_survival_status: 
su.age_at_event_free_survival_status} ) AS survival_filters, + age_at_last_known_survival_status: su.age_at_last_known_survival_status} ) AS survival_filters, COLLECT(DISTINCT{treatment_type: tm.treatment_type, treatment_agent: tm.treatment_agent, age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, @@ -2395,7 +2395,7 @@ Indices: WITH dg, p, sm, sample_file_filter, file, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, event_free_survival_status: su.event_free_survival_status, first_event: su.first_event, - age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + age_at_last_known_survival_status: su.age_at_last_known_survival_status} ) AS survival_filters, COLLECT(DISTINCT{treatment_type: tm.treatment_type, treatment_agent: tm.treatment_agent, age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, @@ -2482,7 +2482,7 @@ Indices: WITH dg, p, sid, sample_id, sample_file_filter, files, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, event_free_survival_status: su.event_free_survival_status, first_event: su.first_event, - age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + age_at_last_known_survival_status: su.age_at_last_known_survival_status} ) AS survival_filters, COLLECT(DISTINCT{treatment_type: tm.treatment_type, treatment_agent: tm.treatment_agent, age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, @@ -2577,7 +2577,7 @@ Indices: st.study_name as study_name, COLLECT(DISTINCT { last_known_survival_status: null, - age_at_event_free_survival_status: null, + age_at_last_known_survival_status: null, event_free_survival_status: null, first_event: null }) AS survival_filters, @@ -2759,7 +2759,7 @@ Indices: properties: last_known_survival_status: type: keyword - age_at_event_free_survival_status: + age_at_last_known_survival_status: type: integer 
event_free_survival_status: type: keyword @@ -2895,7 +2895,7 @@ Indices: WITH sm, opensearch_data, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, event_free_survival_status: su.event_free_survival_status, first_event: su.first_event, - age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters + age_at_last_known_survival_status: su.age_at_last_known_survival_status} ) AS survival_filters WITH sm, apoc.map.merge(opensearch_data, { survival_filters: survival_filters }) AS opensearch_data @@ -3033,7 +3033,7 @@ Indices: properties: last_known_survival_status: type: keyword - age_at_event_free_survival_status: + age_at_last_known_survival_status: type: integer event_free_survival_status: type: keyword @@ -3121,7 +3121,7 @@ Indices: properties: last_known_survival_status: type: keyword - age_at_event_free_survival_status: + age_at_last_known_survival_status: type: integer event_free_survival_status: type: keyword @@ -3267,7 +3267,7 @@ Indices: with file, p, st, sample_diagnosis_filter,COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, event_free_survival_status: su.event_free_survival_status, first_event: su.first_event, - age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + age_at_last_known_survival_status: su.age_at_last_known_survival_status} ) AS survival_filters, COLLECT(DISTINCT{treatment_type: tm.treatment_type, treatment_agent: tm.treatment_agent, age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, @@ -3402,7 +3402,7 @@ Indices: })) AS sample_diagnosis_filter_6, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, event_free_survival_status: su.event_free_survival_status, first_event: su.first_event, - age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + age_at_last_known_survival_status: su.age_at_last_known_survival_status} ) AS 
survival_filters, COLLECT(DISTINCT{treatment_type: tm.treatment_type, treatment_agent: tm.treatment_agent, age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, @@ -3585,7 +3585,7 @@ Indices: with file, p, sample_diagnosis_filter, sm1, sm, st, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, event_free_survival_status: su.event_free_survival_status, first_event: su.first_event, - age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + age_at_last_known_survival_status: su.age_at_last_known_survival_status} ) AS survival_filters, COLLECT(DISTINCT{treatment_type: tm.treatment_type, treatment_agent: tm.treatment_agent, age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, @@ -3804,7 +3804,7 @@ Indices: with distinct p, sm, st, sample_diagnosis_filter, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, event_free_survival_status: su.event_free_survival_status, first_event: su.first_event, - age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + age_at_last_known_survival_status: su.age_at_last_known_survival_status} ) AS survival_filters, COLLECT(DISTINCT{treatment_type: tm.treatment_type, treatment_agent: tm.treatment_agent, age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, @@ -3853,7 +3853,7 @@ Indices: with sm, p, st, dg, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, event_free_survival_status: su.event_free_survival_status, first_event: su.first_event, - age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + age_at_last_known_survival_status: su.age_at_last_known_survival_status} ) AS survival_filters, COLLECT(DISTINCT{treatment_type: tm.treatment_type, treatment_agent: tm.treatment_agent, age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters, @@ -3963,7 +3963,7 @@ Indices: 
with p, st, dg, COLLECT(DISTINCT {last_known_survival_status: su.last_known_survival_status, event_free_survival_status: su.event_free_survival_status, first_event: su.first_event, - age_at_event_free_survival_status: su.age_at_event_free_survival_status} ) AS survival_filters, + age_at_last_known_survival_status: su.age_at_last_known_survival_status} ) AS survival_filters, COLLECT(DISTINCT{treatment_type: tm.treatment_type, treatment_agent: tm.treatment_agent, age_at_treatment_start: tm.age_at_treatment_start}) as treatment_filters,