diff --git a/SemiBin/cluster.py b/SemiBin/cluster.py
index dc9759a..167a00b 100644
--- a/SemiBin/cluster.py
+++ b/SemiBin/cluster.py
@@ -102,11 +102,10 @@ def run_embed_infomap(logger, model, data, * ,
     if is_combined:
         if norm_abundance(train_data_input):
-            train_data_kmer = train_data_input[:, 0:136]
-            train_data_depth = train_data_input[:, 136:len(data.values[0])]
             from sklearn.preprocessing import normalize
-            train_data_depth = normalize(train_data_depth, axis=1, norm='l1')
-            train_data_input = np.concatenate((train_data_kmer, train_data_depth), axis=1)
+            norm = np.sum(train_data_input, axis=0)
+            train_data_input = train_data_input / norm
+            train_data_input = normalize(train_data_input, axis=1, norm='l1')
 
         depth = data.values[:, 136:len(data.values[0])].astype(np.float32)
 
     num_contigs = train_data_input.shape[0]
diff --git a/SemiBin/long_read_cluster.py b/SemiBin/long_read_cluster.py
index adafe85..67f49d8 100644
--- a/SemiBin/long_read_cluster.py
+++ b/SemiBin/long_read_cluster.py
@@ -57,11 +57,10 @@ def cluster_long_read(logger, model, data, device, is_combined,
     else:
         train_data_input = data.values
         if norm_abundance(train_data_input):
-            train_data_kmer = train_data_input[:, 0:136]
-            train_data_depth = train_data_input[:, 136:len(data.values[0])]
             from sklearn.preprocessing import normalize
-            train_data_depth = normalize(train_data_depth, axis=1, norm='l1')
-            train_data_input = np.concatenate((train_data_kmer, train_data_depth), axis=1)
+            norm = np.sum(train_data_input, axis=0)
+            train_data_input = train_data_input / norm
+            train_data_input = normalize(train_data_input, axis=1, norm='l1')
 
     with torch.no_grad():
         model.eval()
diff --git a/SemiBin/main.py b/SemiBin/main.py
index 491846b..8613704 100644
--- a/SemiBin/main.py
+++ b/SemiBin/main.py
@@ -286,8 +286,9 @@ def parse_args(args, is_semibin2):
                        help='Path to the input data.csv file.',
                        dest='data',
                        default=None,)
+
         if p in [multi_easy_bin, generate_sequence_features_multi]:
-            m.add_argument('-b', '--input-bam',
+            p.add_argument('-b', '--input-bam',
                            required=False,
                            nargs='*',
                            help='Path to the input BAM(.bam)/CRAM(.cram) file(s). '
@@ -796,7 +797,6 @@ def generate_sequence_features_single(logger, contig_fasta,
         logger.info('We will only calculate k-mer features.')
 
     if not only_kmer:
-        logger.debug('Start generating kmer features from fasta file.')
         kmer_whole = generate_kmer_features_from_fasta(
             contig_fasta, binned_length, 4)
 
@@ -1201,10 +1201,10 @@ def binning_short(logger, data, minfasta,
         model_path: path to the trained model
     """
     from .cluster import cluster
+    import pandas as pd
     logger.info('Start binning.')
 
     is_combined, n_sample, data, model = binning_preprocess(data, getattr(args, 'depth_metabat2', None), model_path, environment, device)
-
     cluster(
         logger,
         model=model,
diff --git a/SemiBin/self_supervised_model.py b/SemiBin/self_supervised_model.py
index 24441dd..75dfc2b 100644
--- a/SemiBin/self_supervised_model.py
+++ b/SemiBin/self_supervised_model.py
@@ -29,7 +29,6 @@ def train_self(logger, out : str, datapaths, data_splits, is_combined=True,
     """
    from tqdm import tqdm
    import pandas as pd
-    from sklearn.preprocessing import normalize
    import numpy as np
 
    train_data = pd.read_csv(datapaths[0], index_col=0).values
@@ -57,6 +56,7 @@ def train_self(logger, out : str, datapaths, data_splits, is_combined=True,
         logger.debug(f'Reading training data for index {data_index}...')
 
         data = pd.read_csv(datapath, index_col=0)
+        data.index = data.index.astype(str)
         data_split = pd.read_csv(data_split_path, index_col=0)
 
         if mode == 'several':
@@ -67,24 +67,22 @@ def train_self(logger, out : str, datapaths, data_splits, is_combined=True,
 
             train_data = data.values
             train_data_split = data_split.values
-
+        n_must_link = len(train_data_split)
 
         if not is_combined:
             train_data = train_data[:, :136]
         else:
             if norm_abundance(train_data):
-                train_data_kmer = train_data[:, :136]
-                train_data_depth = train_data[:, 136:]
-                train_data_depth = normalize(train_data_depth, axis=1, norm='l1')
-                train_data = np.concatenate((train_data_kmer, train_data_depth), axis=1)
+                from sklearn.preprocessing import normalize
+                norm = np.sum(train_data, axis=0)
+                train_data = train_data / norm
+                train_data_split = train_data_split / norm
+                train_data = normalize(train_data, axis=1, norm='l1')
+                train_data_split = normalize(train_data_split, axis=1, norm='l1')
-                train_data_split_kmer = train_data_split[:, :136]
-                train_data_split_depth = train_data_split[:, 136:]
-                train_data_split_depth = normalize(train_data_split_depth, axis=1, norm='l1')
-                train_data_split = np.concatenate((train_data_split_kmer, train_data_split_depth), axis = 1)
 
         data_length = len(train_data)
         # cannot link data is sampled randomly
-        n_cannot_link = min(len(train_data_split) * 1000 // 2, 4_000_000)
+        n_cannot_link = min(n_must_link * 1000 // 2, 4_000_000)
         indices1 = np.random.choice(data_length, size=n_cannot_link)
         indices2 = indices1 + 1 + np.random.choice(data_length - 1, size=n_cannot_link)
 
@@ -126,6 +124,7 @@ def train_self(logger, out : str, datapaths, data_splits, is_combined=True,
                 supervised_loss = supervised_loss.to(device)
                 supervised_loss.backward()
                 optimizer.step()
+        scheduler.step()
 
     logger.info('Training finished.')
 
diff --git a/SemiBin/semi_supervised_model.py b/SemiBin/semi_supervised_model.py
index 415f0ae..857a5b4 100644
--- a/SemiBin/semi_supervised_model.py
+++ b/SemiBin/semi_supervised_model.py
@@ -217,15 +217,13 @@ def train(logger, out, contig_fastas, binned_lengths, datas, data_splits, cannot
             train_data_split_input = train_data_must_link
         else:
             if norm_abundance(train_data):
-                train_data_kmer = train_data[:, :136]
-                train_data_depth = train_data[:, 136:]
-                train_data_depth = normalize(train_data_depth, axis=1, norm='l1')
-                train_data_input = np.concatenate((train_data_kmer, train_data_depth), axis=1)
-
-                train_data_split_kmer = train_data_must_link[:, :136]
-                train_data_split_depth = train_data_must_link[:, 136:]
-                train_data_split_depth = normalize(train_data_split_depth, axis=1, norm='l1')
-                train_data_split_input = np.concatenate((train_data_split_kmer, train_data_split_depth), axis = 1)
+                from sklearn.preprocessing import normalize
+                norm = np.sum(train_data, axis=0)
+                train_data = train_data / norm
+                train_data_must_link = train_data_must_link / norm
+                train_data_input = normalize(train_data, axis=1, norm='l1')
+                train_data_split_input = normalize(train_data_must_link, axis=1, norm='l1')
+
             else:
                 train_data_input = train_data
                 train_data_split_input = train_data_must_link
@@ -314,6 +312,7 @@ def train(logger, out, contig_fastas, binned_lengths, datas, data_splits, cannot
                                decoder2.double(), is_label=False).to(device)
                 loss.backward()
                 optimizer.step()
+        scheduler.step()
 
     logger.info('Training finished.')
 
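
The normalization change is the same in all four touched modules (cluster.py, long_read_cluster.py, self_supervised_model.py, semi_supervised_model.py): instead of l1-normalizing only the abundance columns (index 136 onward) row by row and re-concatenating them with the k-mer block, the new code first divides every column by its column sum over the training matrix, reusing the same factors for the must-link split so both matrices stay on one scale, and then l1-normalizes entire rows. A minimal standalone sketch of the new scheme, using random toy matrices (only the shapes and the 136-column k-mer convention come from the patch):

    import numpy as np
    from sklearn.preprocessing import normalize

    # Toy stand-ins: rows are contigs; columns 0..135 would be k-mer
    # frequencies and the rest per-sample abundances (values are random).
    rng = np.random.default_rng(0)
    train_data = rng.random((50, 140))
    train_data_split = rng.random((80, 140))

    # Column-wise rescaling: divide each feature by its column sum over
    # train_data, and apply the same factors to train_data_split so the
    # two matrices remain directly comparable.
    norm = np.sum(train_data, axis=0)
    train_data = train_data / norm
    train_data_split = train_data_split / norm

    # Row-wise L1 normalization: each contig's feature vector sums to 1.
    train_data = normalize(train_data, axis=1, norm='l1')
    train_data_split = normalize(train_data_split, axis=1, norm='l1')

One consequence of this design is that the k-mer and abundance blocks are no longer normalized independently; the row-wise l1 step now folds both feature families into a single per-contig distribution.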
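In train_self, the cannot-link budget also switches from len(train_data_split) to n_must_link, which is captured right after the split data is loaded and is therefore unaffected by the rescaling assignments above it (len() of a 2-D array is its row count either way, so this reads mainly as a clarity fix). The sampling trick guarantees distinct pairs by adding an offset of at least 1; the hunk ends before showing how out-of-range indices are handled, so the final modulo in this sketch is an assumption about the usual completion of the trick, not something visible in the patch:

    import numpy as np

    data_length = 1000   # toy value: number of training rows
    n_must_link = 80     # toy value: rows in the must-link split
    n_cannot_link = min(n_must_link * 1000 // 2, 4_000_000)

    # Pair each random index with a second one offset by 1..data_length-1,
    # so no sampled pair can be (i, i).
    indices1 = np.random.choice(data_length, size=n_cannot_link)
    indices2 = indices1 + 1 + np.random.choice(data_length - 1, size=n_cannot_link)
    indices2 %= data_length  # assumed wrap-around to keep indices in range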
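Both training loops additionally gain a scheduler.step() indented at the epoch level, one level above optimizer.step(), so the learning-rate schedule advances once per epoch rather than once per batch. The hunks never show how scheduler is constructed; the StepLR below is a placeholder chosen only to make the call ordering runnable:

    import torch

    model = torch.nn.Linear(140, 32)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    # Placeholder schedule: the real scheduler is defined outside the hunks.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

    for epoch in range(3):
        for _ in range(10):  # batch loop
            optimizer.zero_grad()
            loss = model(torch.randn(4, 140)).pow(2).mean()
            loss.backward()
            optimizer.step()   # advances the optimizer once per batch
        scheduler.step()       # advances the LR schedule once per epoch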