Skip to content

Commit

Permalink
added sample id instead of barcode handling, extended sample sheet parsing, fixed function name typo
Browse files Browse the repository at this point in the history
  • Loading branch information
AmandaBirmingham committed Sep 26, 2024
1 parent d73af00 commit bcd8d46
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 18 deletions.
15 changes: 11 additions & 4 deletions q2_surpi/_formats_and_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
FAMILY_KEY = "family"
TAG_KEY = "tag"
SAMPLE_NAME_KEY = 'Sample_Name'
SS_SAMPLE_ID_KEY = "Sample_ID"
INDEX_1_KEY = "index"
INDEX_2_KEY = "index2"
BARCODE_KEY = 'barcode'
Expand Down Expand Up @@ -46,10 +47,10 @@ class SurpiSampleSheetFormat(model.TextFileFormat):
"""Represents a csv-delimited sample sheet file used by SURPI+."""

def _validate_(self, level):
_ = surpi_count_fp_to_df(self.path)
_ = surpi_sample_sheet_fp_to_df(self.path)


def surpi_count_fp_to_df(fp: str) -> pandas.DataFrame:
def surpi_sample_sheet_fp_to_df(fp: str) -> pandas.DataFrame:
# open the file and count each line until we find one that starts with
# [Data]

Expand All @@ -62,8 +63,14 @@ def surpi_count_fp_to_df(fp: str) -> pandas.DataFrame:
continue
# endif line.startswith("[Data]")

if is_data and not line.startswith(','):
data_table_lines.append(line)
if is_data:
if line.startswith("["):
# if we've reached the beginning of the next section, stop
break

if not line.startswith(','):
# add non-empty lines to the data table
data_table_lines.append(line)
# endif is_data
# endfor line in f
# endwith self.path.open("r") as f
Expand Down
25 changes: 15 additions & 10 deletions q2_surpi/_plugin.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas
from q2_surpi._formats_and_types import FEATURE_ID_KEY, FAMILY_KEY, \
GENUS_KEY, SPECIES_KEY, BARCODE_KEY, TAG_KEY, SAMPLE_NAME_KEY
GENUS_KEY, SPECIES_KEY, BARCODE_KEY, TAG_KEY, SAMPLE_NAME_KEY, \
SS_SAMPLE_ID_KEY

SAMPLE_ID_KEY = 'sample-id'
TAXON_KEY = 'Taxon'
Expand All @@ -15,7 +16,8 @@
# automagically and this will receive pandas.DataFrames as its arguments.
def extract(
surpi_output: pandas.DataFrame,
surpi_sample_info: pandas.DataFrame) -> \
surpi_sample_info: pandas.DataFrame,
ids_are_barcodes: bool = True) -> \
(pandas.DataFrame, pandas.DataFrame):

"""Turn SURPI data into a feature table dataframe and a taxonomy dataframe.
Expand All @@ -26,6 +28,9 @@ def extract(
A DataFrame containing the content of a SURPI counttable [sic] file.
surpi_sample_info_df : pandas.DataFrame
A DataFrame containing the content of a SURPI sample sheet file.
ids_are_barcodes : bool, optional
True if the sample ids are barcodes. False if the sample ids are
sample sheet sample ids. Default is True.
Returns
-------
Expand All @@ -37,6 +42,8 @@ def extract(
the QIIME 2 taxonomy format.
"""

ss_sample_id_key = BARCODE_KEY if ids_are_barcodes else SS_SAMPLE_ID_KEY

# Generate the taxonomy result
taxonomy = surpi_output[[SPECIES_KEY, GENUS_KEY, FAMILY_KEY]].copy()
taxonomy[TAXON_KEY] = surpi_output.apply(
Expand All @@ -56,26 +63,24 @@ def extract(
surpi_feature_table_df[FEATURE_ID_KEY] = taxonomy.index
surpi_feature_table_df = surpi_feature_table_df.set_index(FEATURE_ID_KEY)
surpi_feature_table_df = surpi_feature_table_df.T
surpi_feature_table_df.index.name = BARCODE_KEY
surpi_feature_table_df.index.name = ss_sample_id_key
surpi_feature_table_df = surpi_feature_table_df.reset_index()
feature_barcodes = surpi_feature_table_df[BARCODE_KEY].unique()
feature_barcodes = surpi_feature_table_df[ss_sample_id_key].unique()

# merge the sample info with the feature table
# TODO: this is speculative code and may need to be adjusted; I don't
# know yet what the sample info looks like
limited_sample_info_df = \
surpi_sample_info[[BARCODE_KEY, SAMPLE_NAME_KEY]]
surpi_sample_info[[ss_sample_id_key, SAMPLE_NAME_KEY]]
surpi_feature_table_df = surpi_feature_table_df.merge(
limited_sample_info_df, on=BARCODE_KEY, how='inner',
limited_sample_info_df, on=ss_sample_id_key, how='inner',
validate='one_to_one')
identified_barcodes = surpi_feature_table_df[BARCODE_KEY].unique()
identified_barcodes = surpi_feature_table_df[ss_sample_id_key].unique()
unidentified_barcodes = set(feature_barcodes) - set(identified_barcodes)
if len(unidentified_barcodes) > 0:
raise ValueError(
f"The following barcodes were not linked to sample identifiers "
f"in the sample sheet: {unidentified_barcodes}")

surpi_feature_table_df.drop(columns=[BARCODE_KEY], inplace=True)
surpi_feature_table_df.drop(columns=[ss_sample_id_key], inplace=True)
surpi_feature_table_df.set_index(SAMPLE_NAME_KEY, inplace=True)
surpi_feature_table_df.index.name = SAMPLE_ID_KEY

Expand Down
12 changes: 8 additions & 4 deletions q2_surpi/plugin_setup.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import pandas
from q2_types.feature_table import FeatureTable, Frequency
from q2_types.feature_data import FeatureData, Taxonomy
from qiime2.plugin import (Plugin, Citations)
from qiime2.plugin import (Plugin, Citations, Bool)
import q2_surpi
from q2_surpi._formats_and_types import (
SurpiCountTable, SurpiCountTableFormat, SurpiCountTableDirectoryFormat,
SurpiSampleSheet, SurpiSampleSheetFormat, SurpiSampleSheetDirectoryFormat,
surpi_count_fp_to_df)
surpi_sample_sheet_fp_to_df)


plugin = Plugin(
Expand Down Expand Up @@ -42,7 +42,7 @@ def _1(ff: SurpiCountTableFormat) -> pandas.DataFrame:
@plugin.register_transformer
# load a SurpiSampleSheetFormat into a dataframe
def _2(ff: SurpiSampleSheetFormat) -> pandas.DataFrame:
result = surpi_count_fp_to_df(str(ff))
result = surpi_sample_sheet_fp_to_df(str(ff))
return result


Expand Down Expand Up @@ -70,7 +70,11 @@ def _2(ff: SurpiSampleSheetFormat) -> pandas.DataFrame:
input_descriptions={
'surpi_output': "SURPI counts per species per barcode.",
'surpi_sample_info': 'Info linking sample ids to barcodes.'},
parameters={},
parameters={'ids_are_barcodes': Bool},
parameter_descriptions={
'ids_are_barcodes': ("True if the sample ids in the count tables are "
"barcodes. False if they are the sample sheet's "
"sample ids. Default is True.")},
outputs=[('table', FeatureTable[Frequency]),
('taxonomy', FeatureData[Taxonomy])],
output_descriptions={
Expand Down

0 comments on commit bcd8d46

Please sign in to comment.