Updating the code base for NAR publication

JinmiaoChenLab · Oct 24, 2024 · 75c113d · 75c113d
1 parent 2d7805d
commit 75c113d
Show file tree

Hide file tree

Showing 34 changed files with 1,405 additions and 581 deletions.
diff --git a/.gitignore b/.gitignore
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@
 -->
 [![Documentation Status](https://readthedocs.org/projects/discotoolkit-py/badge/?version=latest)](https://discotoolkit-py.readthedocs.io/en/latest/?badge=latest) [![Downloads](https://static.pepy.tech/personalized-badge/discotoolkit?period=total&units=international_system&left_color=black&right_color=orange&left_text=Downloads)](https://pepy.tech/project/discotoolkit) [![PyPI version](https://img.shields.io/pypi/v/discotoolkit)](https://pypi.org/project/discotoolkit)
 
-# DISCOtoolkit 1.1.3
+# DISCOtoolkit 1.1.4
 
 DISCOtoolkit is a Python package that allows users to access data and use the tools provided by the [DISCO database](https://www.immunesinglecell.org/). Read the documentation [DISCOtoolkit](https://discotoolkit-py.readthedocs.io/en/latest/). It provides the following functions:
 

diff --git a/__pycache__/main.cpython-38.pyc b/__pycache__/main.cpython-38.pyc
diff --git a/__pycache__/setup.cpython-38.pyc b/__pycache__/setup.cpython-38.pyc
diff --git a/__pycache__/test_file.cpython-38.pyc b/__pycache__/test_file.cpython-38.pyc
diff --git a/build/lib/discotoolkit/CELLiD.py b/build/lib/discotoolkit/CELLiD.py
diff --git a/build/lib/discotoolkit/DiscoClass.py b/build/lib/discotoolkit/DiscoClass.py
@@ -6,10 +6,10 @@ class Filter:
     """
     Filter class object to save the attributes for filtering the dataset from DISCO
 
-    sample                      String    e.g. GSM3891625_3;
-    project                     String;
-    tissue                      String    e.g. Lung, Bladder;
-    disease                     String    e.g. PDAC;
+    sample_id                      String    e.g. ERX2757110;
+    project_id                     String;
+    tissue                      String    e.g. lung, bladder;
+    disease                     String    e.g. COVID-19;
     platform                    String    e.g. 10x3';
     sample_type                 String;
     cell_type                   String;
@@ -20,12 +20,12 @@ class Filter:
     return Class object
     """
 
-    def __init__(self, sample = None, project = None, tissue = None, disease = None, platform = None, sample_type = None,
+    def __init__(self, sample_id = None, project_id = None, tissue = None, disease = None, platform = None, sample_type = None,
                  cell_type = None, cell_type_confidence : str = "medium", include_cell_type_children : bool = True, min_cell_per_sample : int = 100):
 
         # handling for string and list input
-        self.sample = self.convert_to_list(sample) # sample id
-        self.project = self.convert_to_list(project) # project, lab, or dataset from different author
+        self.sample_id = self.convert_to_list(sample_id) # sample id
+        self.project_id = self.convert_to_list(project_id) # project, lab, or dataset from different author
         self.tissue = self.convert_to_list(tissue) # organ tissue 
         self.disease = self.convert_to_list(disease) # cancer or non cancer, or COVID-1e9 disease
         self.platform = self.convert_to_list(platform) # sequencing platform

diff --git a/build/lib/discotoolkit/DownloadDiscoData.py b/build/lib/discotoolkit/DownloadDiscoData.py
diff --git a/build/lib/discotoolkit/GetMetadata.py b/build/lib/discotoolkit/GetMetadata.py
diff --git a/build/lib/discotoolkit/GlobalVariable.py b/build/lib/discotoolkit/GlobalVariable.py
@@ -1,3 +1,11 @@
+'''
+Descripttion: 
+version: 
+Author: Mengwei Li
+Date: 2023-07-06 16:59:59
+LastEditors: Mengwei Li
+LastEditTime: 2024-10-23 15:52:29
+'''
 """
 Global variable file to import for the subsequent script.
 
@@ -17,9 +25,9 @@
 timeout = 600
 
 # Define package-level variable
-response = requests.get("http://www.immunesinglecell.org/api/vishuo/getToolkitUrl")
+# response = requests.get("http://www.immunesinglecell.org/api/vishuo/getToolkitUrl")
 
-if response.status_code == 200:
-    prefix_disco_url = json.loads(response.text)["url"]
-else:
-    prefix_disco_url = "http://www.immunesinglecell.org/toolkitapi"
+# if response.status_code == 200:
+#     prefix_disco_url = json.loads(response.text)["url"]
+# else:
+prefix_disco_url = "https://immunesinglecell.org/disco_v3_api/"
diff --git a/build/lib/discotoolkit/Utilities.py b/build/lib/discotoolkit/Utilities.py
@@ -0,0 +1,74 @@
+import h5py
+import numpy as np
+from scipy.sparse import csr_matrix
+from pathlib import Path
+
+
+def write_10X_h5(adata, file):
+    """Writes adata to a 10X-formatted h5 file.
+
+    Note that this function is not fully tested and may not work for all cases.
+    It will not write the following keys to the h5 file compared to 10X:
+    '_all_tag_keys', 'pattern', 'read', 'sequence'
+
+    Args:
+        adata (AnnData object): AnnData object to be written.
+        file (str): File name to be written to. If no extension is given, '.h5' is appended.
+
+    Raises:
+        FileExistsError: If file already exists.
+
+    Returns:
+        None
+    """
+
+    if ".h5" not in file:
+        file = f"{file}.h5"
+    if Path(file).exists():
+        raise FileExistsError(f"There already is a file `{file}`.")
+
+    def int_max(x):
+        return int(max(np.floor(len(str(int(max(x)))) / 4), 1) * 4)
+
+    def str_max(x):
+        return max([len(i) for i in x])
+
+    w = h5py.File(file, "w")
+    grp = w.create_group("matrix")
+    grp.create_dataset(
+        "barcodes",
+        data=np.array(adata.obs_names, dtype=f"|S{str_max(adata.obs_names)}"),
+    )
+    grp.create_dataset(
+        "data", data=np.array(adata.X.data, dtype=f"<i{int_max(adata.X.data)}")
+    )
+    ftrs = grp.create_group("features")
+    # this group will lack the following keys:
+    # '_all_tag_keys', 'feature_type', 'genome', 'id', 'name', 'pattern', 'read', 'sequence'
+    ftrs.create_dataset(
+        "feature_type",
+        data=np.array(
+            adata.var.feature_types, dtype=f"|S{str_max(adata.var.feature_types)}"
+        ),
+    )
+    ftrs.create_dataset(
+        "genome",
+        data=np.array(adata.var.genome, dtype=f"|S{str_max(adata.var.genome)}"),
+    )
+    ftrs.create_dataset(
+        "id",
+        data=np.array(adata.var.gene_ids, dtype=f"|S{str_max(adata.var.gene_ids)}"),
+    )
+    ftrs.create_dataset(
+        "name", data=np.array(adata.var.index, dtype=f"|S{str_max(adata.var.index)}")
+    )
+    grp.create_dataset(
+        "indices", data=np.array(adata.X.indices, dtype=f"<i{int_max(adata.X.indices)}")
+    )
+    grp.create_dataset(
+        "indptr", data=np.array(adata.X.indptr, dtype=f"<i{int_max(adata.X.indptr)}")
+    )
+    grp.create_dataset(
+        "shape",
+        data=np.array(list(adata.X.shape)[::-1], dtype=f"<i{int_max(adata.X.shape)}"),
+    )
diff --git a/build/lib/discotoolkit/__init__.py b/build/lib/discotoolkit/__init__.py
@@ -4,4 +4,4 @@
 from .CELLiD import *
 from .GeneSearch import *
 
-__version__ = "1.1.3"
+__version__ = "1.1.4"