separation of table function into two portions

saezlab · Sep 26, 2024 · a423239 · a423239
1 parent 44bb9d6
commit a423239
Showing 1 changed file with 33 additions and 2 deletions.
diff --git a/pypath/inputs/hmdb/common.py b/pypath/inputs/hmdb/common.py
@@ -92,7 +92,7 @@ def raw(
     return result
 
 
-def table(
+def processed(
         *fields: str | tuple,
         dataset: Literal['metabolites', 'proteins'],
         head: int | None = None,
@@ -122,7 +122,6 @@ def table(
             Process the first N records only. Useful for peeking into
             the data.
     """
-
     SCHEMA = globals()[f'{dataset.upper()}_SCHEMA']
     fields = [Field(d[0], *d) for d in (common.to_tuple(f) for f in fields)]
     fields.extend(Field(n, *f) for n, f in named_fields.items())
@@ -142,6 +141,38 @@ def table(
         cols, rows = zip(*(f.process(record) for f in fields))
         columns = columns or list(itertools.chain(*cols))
         data.extend(itertools.product(*itertools.chain(*rows)))
+
+
+def table(
+        *fields: str | tuple,
+        dataset: Literal['metabolites', 'proteins'],
+        head: int | None = None,
+        **named_fields: str | tuple,
+    ) -> pd.DataFrame:
+    """
+    Parse various simple and nested array fields from HMDB into data frame.
+
+    Args:
+        fields:
+            Fields to include in the data frame. These must be keys in the
+            schema, and will be also used as column names. Alternatively,
+            tuples of sequetial processing steps can be provided: strings
+            will be used as keys in nested dicts, tuples will be used as
+            multiple keys in dicts, each yielding a separate column, the
+            special symbol "*" means all keys in the sub-dict, while "@"
+            means expand arrays into multiple rows. Be careful with this
+            latter option because it is applied in a combinatorial way, i.e.
+            in case of expanding an array to 5 rowns, and another one to 7
+            rows results already 35 rows from a single record. This might
+            result excessive memory use and processing time.
+        named_fields:
+            Same as `fields`, but the column name can be different from the
+            top level key: argument names will be used as column names,
+            values will be used as processing steps.
+        head:
+            Process the first N records only. Useful for peeking into
+            the data.
+    """
 
         if (i + 1) % 100 == 0: