
update normalization
psj1997 committed Jul 31, 2024
1 parent 59c6e35 commit ba71201
Showing 5 changed files with 27 additions and 31 deletions.
SemiBin/cluster.py (3 additions & 4 deletions)
@@ -102,11 +102,10 @@ def run_embed_infomap(logger, model, data, * ,

     if is_combined:
         if norm_abundance(train_data_input):
-            train_data_kmer = train_data_input[:, 0:136]
-            train_data_depth = train_data_input[:, 136:len(data.values[0])]
             from sklearn.preprocessing import normalize
-            train_data_depth = normalize(train_data_depth, axis=1, norm='l1')
-            train_data_input = np.concatenate((train_data_kmer, train_data_depth), axis=1)
+            norm = np.sum(train_data_input, axis=0)
+            train_data_input = train_data_input / norm
+            train_data_input = normalize(train_data_input, axis=1, norm='l1')

         depth = data.values[:, 136:len(data.values[0])].astype(np.float32)
         num_contigs = train_data_input.shape[0]
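Across these files the commit replaces the old scheme (L1-normalize only the abundance columns, then concatenate them back onto the 136 k-mer columns) with a two-step whole-matrix normalization: divide every column by its sum over all contigs, then L1-normalize each row. A minimal standalone sketch of the new behaviour, with a made-up feature matrix and N_KMER = 136 mirroring the k-mer column count above:

    # Sketch only: demonstrates the normalization now used when k-mer and abundance
    # features are combined. The matrix shape and random seed are placeholders.
    import numpy as np
    from sklearn.preprocessing import normalize

    N_KMER = 136                               # k-mer feature columns, as in the diff above
    rng = np.random.default_rng(0)
    features = rng.random((5, N_KMER + 3))     # 5 contigs, 136 k-mer + 3 abundance columns

    col_sums = np.sum(features, axis=0)        # step 1: rescale each column by its total
    features = features / col_sums

    features = normalize(features, axis=1, norm='l1')   # step 2: each row now sums to 1
    assert np.allclose(features.sum(axis=1), 1.0)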
SemiBin/long_read_cluster.py (3 additions & 4 deletions)
@@ -57,11 +57,10 @@ def cluster_long_read(logger, model, data, device, is_combined,
     else:
         train_data_input = data.values
         if norm_abundance(train_data_input):
-            train_data_kmer = train_data_input[:, 0:136]
-            train_data_depth = train_data_input[:, 136:len(data.values[0])]
             from sklearn.preprocessing import normalize
-            train_data_depth = normalize(train_data_depth, axis=1, norm='l1')
-            train_data_input = np.concatenate((train_data_kmer, train_data_depth), axis=1)
+            norm = np.sum(train_data_input, axis=0)
+            train_data_input = train_data_input / norm
+            train_data_input = normalize(train_data_input, axis=1, norm='l1')

     with torch.no_grad():
         model.eval()
SemiBin/main.py (3 additions & 3 deletions)
@@ -286,8 +286,9 @@ def parse_args(args, is_semibin2):
                        help='Path to the input data.csv file.',
                        dest='data',
                        default=None,)
+
     if p in [multi_easy_bin, generate_sequence_features_multi]:
-        m.add_argument('-b', '--input-bam',
+        p.add_argument('-b', '--input-bam',
                        required=False,
                        nargs='*',
                        help='Path to the input BAM(.bam)/CRAM(.cram) file(s). '
@@ -796,7 +797,6 @@ def generate_sequence_features_single(logger, contig_fasta,
         logger.info('We will only calculate k-mer features.')

     if not only_kmer:
-
         logger.debug('Start generating kmer features from fasta file.')
         kmer_whole = generate_kmer_features_from_fasta(
             contig_fasta, binned_length, 4)
@@ -1201,10 +1201,10 @@ def binning_short(logger, data, minfasta,
         model_path: path to the trained model
     """
     from .cluster import cluster
-    import pandas as pd
     logger.info('Start binning.')

     is_combined, n_sample, data, model = binning_preprocess(data, getattr(args, 'depth_metabat2', None), model_path, environment, device)
+
     cluster(
         logger,
         model=model,
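The first main.py hunk above fixes which parser object receives the -b/--input-bam option: inside the block guarded by `if p in [multi_easy_bin, generate_sequence_features_multi]:`, the option is now added through the loop variable p rather than through m, so each subparser in that list gets the flag. A simplified, hypothetical sketch of the pattern (parser and subcommand names are stand-ins, not SemiBin's full argument setup):

    # Hypothetical illustration of adding a shared option to several subparsers.
    import argparse

    parser = argparse.ArgumentParser(prog='demo')
    sub = parser.add_subparsers(dest='cmd')
    multi_easy_bin = sub.add_parser('multi_easy_bin')
    generate_sequence_features_multi = sub.add_parser('generate_sequence_features_multi')

    for p in [multi_easy_bin, generate_sequence_features_multi]:
        # Using the loop variable `p` attaches the flag to every subparser in the list;
        # pointing at one fixed parser here would leave the others without the option.
        p.add_argument('-b', '--input-bam', required=False, nargs='*',
                       help='Path to the input BAM(.bam)/CRAM(.cram) file(s).')

    print(parser.parse_args(['multi_easy_bin', '-b', 'sample1.bam']))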
SemiBin/self_supervised_model.py (10 additions & 11 deletions)
@@ -29,7 +29,6 @@ def train_self(logger, out : str, datapaths, data_splits, is_combined=True,
     """
     from tqdm import tqdm
     import pandas as pd
-    from sklearn.preprocessing import normalize
     import numpy as np

     train_data = pd.read_csv(datapaths[0], index_col=0).values
@@ -57,6 +56,7 @@ def train_self(logger, out : str, datapaths, data_splits, is_combined=True,
         logger.debug(f'Reading training data for index {data_index}...')

         data = pd.read_csv(datapath, index_col=0)
+        data.index = data.index.astype(str)
         data_split = pd.read_csv(data_split_path, index_col=0)

         if mode == 'several':
@@ -67,24 +67,22 @@

         train_data = data.values
         train_data_split = data_split.values

+        n_must_link = len(train_data_split)
         if not is_combined:
             train_data = train_data[:, :136]
         else:
             if norm_abundance(train_data):
-                train_data_kmer = train_data[:, :136]
-                train_data_depth = train_data[:, 136:]
-                train_data_depth = normalize(train_data_depth, axis=1, norm='l1')
-                train_data = np.concatenate((train_data_kmer, train_data_depth), axis=1)
+                from sklearn.preprocessing import normalize
+                norm = np.sum(train_data, axis=0)
+                train_data = train_data / norm
+                train_data_split = train_data_split / norm
+                train_data = normalize(train_data, axis=1, norm='l1')
+                train_data_split = normalize(train_data_split, axis=1, norm='l1')

-                train_data_split_kmer = train_data_split[:, :136]
-                train_data_split_depth = train_data_split[:, 136:]
-                train_data_split_depth = normalize(train_data_split_depth, axis=1, norm='l1')
-                train_data_split = np.concatenate((train_data_split_kmer, train_data_split_depth), axis = 1)
-
         data_length = len(train_data)
         # cannot link data is sampled randomly
-        n_cannot_link = min(len(train_data_split) * 1000 // 2, 4_000_000)
+        n_cannot_link = min(n_must_link * 1000 // 2, 4_000_000)
         indices1 = np.random.choice(data_length, size=n_cannot_link)
         indices2 = indices1 + 1 + np.random.choice(data_length - 1,
                                                     size=n_cannot_link)
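Note that in the self-supervised trainer the column totals are computed from train_data and then reused to rescale train_data_split, so the must-link split features end up on the same scale as the full training matrix before both are row-L1-normalized. A small self-contained sketch of that detail (shapes and values are made up):

    # Sketch only: the split (must-link) matrix is divided by the column sums of the
    # full training matrix, then both matrices are L1-normalized per row.
    import numpy as np
    from sklearn.preprocessing import normalize

    rng = np.random.default_rng(1)
    train_data = rng.random((8, 140))         # e.g. 136 k-mer + 4 abundance columns
    train_data_split = rng.random((6, 140))   # features of the split contigs

    norm = np.sum(train_data, axis=0)         # one scale factor per feature column
    train_data = train_data / norm
    train_data_split = train_data_split / norm   # same scale keeps both sets comparable

    train_data = normalize(train_data, axis=1, norm='l1')
    train_data_split = normalize(train_data_split, axis=1, norm='l1')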
@@ -126,6 +124,7 @@ def train_self(logger, out : str, datapaths, data_splits, is_combined=True,
             supervised_loss = supervised_loss.to(device)
             supervised_loss.backward()
             optimizer.step()
+
         scheduler.step()

     logger.info('Training finished.')
SemiBin/semi_supervised_model.py (8 additions & 9 deletions)
@@ -217,15 +217,13 @@ def train(logger, out, contig_fastas, binned_lengths, datas, data_splits, cannot
             train_data_split_input = train_data_must_link
         else:
             if norm_abundance(train_data):
-                train_data_kmer = train_data[:, :136]
-                train_data_depth = train_data[:, 136:]
-                train_data_depth = normalize(train_data_depth, axis=1, norm='l1')
-                train_data_input = np.concatenate((train_data_kmer, train_data_depth), axis=1)
-
-                train_data_split_kmer = train_data_must_link[:, :136]
-                train_data_split_depth = train_data_must_link[:, 136:]
-                train_data_split_depth = normalize(train_data_split_depth, axis=1, norm='l1')
-                train_data_split_input = np.concatenate((train_data_split_kmer, train_data_split_depth), axis = 1)
+                from sklearn.preprocessing import normalize
+                norm = np.sum(train_data, axis=0)
+                train_data = train_data / norm
+                train_data_must_link = train_data_must_link / norm
+                train_data_input = normalize(train_data, axis=1, norm='l1')
+                train_data_split_input = normalize(train_data_must_link, axis=1, norm='l1')
+
             else:
                 train_data_input = train_data
                 train_data_split_input = train_data_must_link
@@ -314,6 +312,7 @@ def train(logger, out, contig_fastas, binned_lengths, datas, data_splits, cannot
                                  decoder2.double(), is_label=False).to(device)
             loss.backward()
             optimizer.step()
+
         scheduler.step()

     logger.info('Training finished.')
