Merge branch 'main' into patch-1

EGA-archive · Nov 11, 2024 · 5f0dd4b · 5f0dd4b
2 parents 2b502cd + 162a1bc
commit 5f0dd4b
Show file tree

Hide file tree

Showing 15 changed files with 231 additions and 1,029 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -1,2 +1,4 @@
 files/vcf/*
-files/vcf/files_to_read/*
+files/vcf/files_to_read/*
+*.vcf.gz
+*.vcf.bgz
diff --git a/.gitignore b/.gitignore
@@ -10,4 +10,5 @@ scripts/datasheet/conf/__pycache__/*
 *.vcf.gz
 *.vcf
 *.vcf.gz.tbi
+*.vcf.bgz.tbi
 vcf_BelCovid_2
diff --git a/Dockerfile b/Dockerfile
@@ -11,9 +11,10 @@ RUN apt-get update
 RUN apt-get install -y --no-install-recommends \
     ca-certificates pkg-config make \
     libssl-dev libffi-dev libpq-dev
+RUN apt install vim -y
 COPY . .
 COPY ./requirements.txt .
 RUN pip install --upgrade pip
 RUN pip install wheel setuptools pip --upgrade
-RUN pip install -r requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
 ENTRYPOINT ["tail", "-f", "/dev/null"]
diff --git a/README.md b/README.md
@@ -34,13 +34,17 @@ Once the container is up and running you can start using beacon ri tools v2, con
 To start using beacon ri tools v2, you have to edit the configuration file [conf.py](https://github.com/EGA-archive/beacon2-ri-tools-v2/tree/main/conf/conf.py) that you will find inside [conf](https://github.com/EGA-archive/beacon2-ri-tools-v2/tree/main/conf). Inside this file you will find the following information:
 ```bash
 #### Input and Output files config parameters ####
-csv_folder='csv/examples/'
-output_docs_folder='output_docs/CINECA_dataset/'
+csv_folder = './csv/examples/'
+output_docs_folder='./output_docs/'
 
 #### VCF Conversion config parameters ####
-reference_genome='GRCh37' # Choose one between NCBI36, GRCh37, GRCh38
+allele_frequency=1 # introduce float number, leave 1 if you want to convert all the variants
+reference_genome='GRCh38' # Choose one between NCBI36, GRCh37, GRCh38
+datasetId='coadread_tcga_pan_can_atlas_2018'
 ```
 
+Please remember to make the datasetId match the id used in your datasets.csv file.
+
 #### Generic config parameters
 The **csv_folder** variable sets the folder that contains all the .csv files the tool will work with. All the .csv files must follow a specific header structure. You can find an example here [templates](https://github.com/EGA-archive/beacon2-ri-tools-v2/tree/main/csv/templates). Note that any header with column names different from the ones that appear inside the files of this folder will not be read by the beacon ri tools v2.
 The **output_docs_folder** variable sets the folder where your final .json files will be saved once beacon tools finishes executing. This folder should always be located within 'output_docs', and the only part of the path that can be altered is the subdirectory of 'output_docs'.

diff --git a/analyses_csv.py b/analyses_csv.py
@@ -348,6 +348,7 @@ def generate(dict_properties, list_of_headers):
                         if propk == new_item:
                             definitivedict[key]=propv
             Analyses(**definitivedict)
+            definitivedict["datasetId"]=conf.datasetId
             total_dict.append(definitivedict)
 
 

diff --git a/biosamples_csv.py b/biosamples_csv.py
@@ -341,6 +341,7 @@ def generate(dict_properties, list_of_headers):
                         if propk == new_item:
                             definitivedict[key]=propv
             Biosamples(**definitivedict)
+            definitivedict["datasetId"]=conf.datasetId
             total_dict.append(definitivedict)
 
 

diff --git a/cohorts_csv.py b/cohorts_csv.py
@@ -350,6 +350,7 @@ def generate(dict_properties, list_of_headers):
                                 propv = propv
                             definitivedict[key]=propv
             Cohorts(**definitivedict)
+            definitivedict["datasetId"]=conf.datasetId
             total_dict.append(definitivedict)
 
 

diff --git a/conf/conf.py b/conf/conf.py
@@ -3,8 +3,11 @@
 output_docs_folder='./output_docs/'
 
 #### VCF Conversion config parameters ####
-allele_frequency=1 # introduce float number, leave 1 if you want to convert all the variants
-reference_genome='GRCh38' # Choose one between NCBI36, GRCh37, GRCh38
+allele_counts=True
+reference_genome='GRCh37' # Choose one between NCBI36, GRCh37, GRCh38
+datasetId='COVID_pop11_fin_2'
+case_level_data=False
+num_rows=7000000
 
 ### MongoDB parameters ###
 database_host = 'mongo'

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -22,6 +22,7 @@ services:
   db:
     container_name: ri-tools-mongo
     image: mongo:5
+    command: --wiredTigerCacheSizeGB 4
     hostname: mongo
     ports:
       - 27017:27017

diff --git a/genomicVariations.json b/genomicVariations.json
diff --git a/genomicVariations_csv.py b/genomicVariations_csv.py
@@ -511,6 +511,7 @@ def generate(dict_properties,list_of_headers):
                         if propk == new_item:
                             definitivedict[key]=propv
             GenomicVariations(**definitivedict)
+            definitivedict["datasetId"]=conf.datasetId
             total_dict.append(definitivedict)