Skip to content

Commit

Permalink
Merge the revision changes
Browse files Browse the repository at this point in the history
  • Loading branch information
f0t1h committed Dec 15, 2023
1 parent 09e30d6 commit 132c173
Show file tree
Hide file tree
Showing 5 changed files with 308 additions and 72 deletions.
20 changes: 17 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,28 +48,42 @@ endif

MAIN = $(SRC_PATH)/tksm.cpp
EXEC = $(BIN_PATH)/tksm
MAIN_FILE = tksm.cpp
MAIN_OBJECT = $(OBJ_PATH)/tksm.o

SRC_FILES = tksm.cpp tag.cpp truncate.cpp transcribe.cpp scb.cpp sequence.cpp polyA.cpp pcr.cpp model_truncation.cpp abundance.cpp strand_man.cpp filter.cpp random_wgs.cpp shuffle.cpp unsegment.cpp append_noise.cpp mutate.cpp
SRC_FILES = tag.cpp truncate.cpp transcribe.cpp scb.cpp sequence.cpp polyA.cpp pcr.cpp model_truncation.cpp abundance.cpp strand_man.cpp filter.cpp random_wgs.cpp shuffle.cpp unsegment.cpp append_noise.cpp mutate.cpp

#Append SRC_PATH to SRC_FILES
SRC_FILES := $(addprefix $(SRC_PATH)/,$(SRC_FILES))

OBJECTS = $(SRC_FILES:$(SRC_PATH)/%.cpp=$(OBJ_PATH)/%.o)

SPLIT_EXECS = $(SRC_FILES:$(SRC_PATH)/%.cpp=$(BIN_PATH)/%.exe)

PCH_HEADER = $(SRC_PATH)/headers.h
PCH_OBJECT = $(PCH_HEADER:$(SRC_PATH)/%.h=$(OBJ_PATH)/%.gch)

PY_FILES = $(wildcard py/*.py)
PY_HEADERS = $(PY_FILES:py/%.py=py_header/%.h)

$(EXEC): $(OBJECTS) $(PY_HEADERS) install.sh $(PCH_OBJECT)
$(EXEC): $(MAIN_OBJECT) $(OBJECTS) $(PY_HEADERS) install.sh $(PCH_OBJECT)
@mkdir -p $(dir $@)
$(CXX) $(CXXFLAGS) $(PY_CXXFLAGS) -I$(PY_HEADER_PATH) -I$(EXTERN_HEADER_PATH) -o $@ $(OBJECTS) $(PY_LDFLAGS) $(LDFLAGS) -include $(PCH_OBJECT)
$(CXX) $(CXXFLAGS) $(PY_CXXFLAGS) -I$(PY_HEADER_PATH) -I$(EXTERN_HEADER_PATH) -o $@ $(MAIN_OBJECT) $(OBJECTS) $(PY_LDFLAGS) $(LDFLAGS) -include $(PCH_OBJECT)

.PHONY: split_binary
split_binary: $(SPLIT_EXECS)

$(BIN_PATH)/%.exe: $(OBJ_PATH)/%_m.o
$(CXX) $(CXXFLAGS) $(PY_CXXFLAGS) -I$(EXTERN_HEADER_PATH) -I$(PY_HEADER_PATH) $(PY_LDFLAGS) $(LDFLAGS) -o $@ $< -DMULTI_BINARY

$(BIN_PATH)/%: $(SRC_PATH)/%.cpp $(PY_HEADERS)
@mkdir -p $(dir $@)
$(CXX) $(CXXFLAGS) $(PY_CXXFLAGS) -I$(PY_HEADER_PATH) -I$(EXTERN_HEADER_PATH) -o $@ $< $(PY_LDFLAGS) $(LDFLAGS)

$(OBJ_PATH)/%_m.o: $(SRC_PATH)/%.cpp $(PY_HEADERS)
@mkdir -p $(dir $@)
$(CXX) $(CXXFLAGS) $(PY_CXXFLAGS) -I$(EXTERN_HEADER_PATH) -I$(PY_HEADER_PATH) -c -o $@ $< -DMULTI_BINARY

$(OBJ_PATH)/%.o: $(SRC_PATH)/%.cpp $(PY_HEADERS)
@mkdir -p $(dir $@)
$(CXX) $(CXXFLAGS) $(PY_CXXFLAGS) -I$(EXTERN_HEADER_PATH) -I$(PY_HEADER_PATH) -c -o $@ $<
Expand Down
5 changes: 3 additions & 2 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -227,8 +227,9 @@ rule sequence:
wildcard_constraints:
exprmnt=exprmnts_re,
shell:
f"{format_gnu_time_string(process='sequence')}"
"{params.model_var} {params.binary} sequence"
"{params.model_var}"
f" {format_gnu_time_string(process='sequence')}"
" {params.binary} sequence"
" -i {input.mdf}"
" --references {params.fastas}"
" -o {output.fastq}"
Expand Down
238 changes: 203 additions & 35 deletions py/truncate_kde.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV
import sys


kde_vals = None

Expand Down Expand Up @@ -39,13 +41,13 @@ def parse_args():
parser.add_argument(
"--grid-start",
type=int,
default=100,
default=0,
help="Read/transcript length start of the KDE grid.",
)
parser.add_argument(
"--grid-end",
type=int,
default=5000,
default=10000,
help="Read/transcript length end of the KDE grid.",
)
parser.add_argument(
Expand All @@ -61,6 +63,24 @@ def parse_args():
default=1,
help="Number of threads to run KDE and GridSearchCV.",
)
parser.add_argument(
"--model-lengths",
default=False,
action="store_true",
help="Model read lengths instead of truncation lengths",
)
parser.add_argument(
"--model-separate",
default=False,
action="store_true",
help="Model read lengths instead of truncation lengths",
)
parser.add_argument(
"--model-3D",
default=False,
action="store_true",
help="Model read lengths instead of truncation lengths",
)

class ListPrinter(argparse.Action):
def __call__(self, parser, namespace, values, option_string):
Expand All @@ -87,6 +107,68 @@ def score_samples_runner(xy):
return xy, kde_vals.score_samples(xy)


def get_truncation_lens_paired_with_transcript_lens_53(paf):
truncation_lengths3 = []
truncation_lengths5 = []
transcript_lens = []
for line in tqdm(open(paf, "r")):
if "tp:A:P" not in line:
continue
line = line.rstrip("\n").split("\t")
strand = line[4]
tlen = int(line[6])
start = int(line[7])
end = int(line[8])

truncation_lengths5.append(start)
truncation_lengths3.append(tlen - end)
transcript_lens.append(tlen)
return truncation_lengths5, truncation_lengths3, transcript_lens


def get_truncation_lens_paired_with_transcript_lens_X4(paf):
truncation_lengths3 = [[], []]
truncation_lengths5 = [[], []]
transcript_lens = [[], []]
for line in tqdm(open(paf, "r")):
if "tp:A:P" not in line:
continue
line = line.rstrip("\n").split("\t")
strand = line[4]
tlen = int(line[6])
start = int(line[7])
end = int(line[8])
idx = 0 if strand == "+" else 1
truncation_lengths5[idx].append(start)
truncation_lengths3[idx].append(tlen - end)
transcript_lens[idx].append(tlen)
return truncation_lengths5, truncation_lengths3, transcript_lens


def get_truncation_lens_paired_with_transcript_lens(paf):
truncation_lengths = list()
transcript_lens = list()
end_ratios = list()
for line in tqdm(open(paf, "r")):
if "tp:A:P" not in line:
continue
line = line.rstrip("\n").split("\t")
strand = line[4]
tlen = int(line[6])
tstart = int(line[7])
tend = int(line[8])
truncation_length = tstart + (tlen - tend)
transcript_lens.append(tlen)
truncation_lengths.append(truncation_length)
if truncation_length > 0:
if strand == "+":
end_truncation = tlen - tend
else:
end_truncation = tstart
end_ratios.append(end_truncation / truncation_length)
return truncation_lengths, transcript_lens, end_ratios


def get_alignment_lens(paf):
tlens = list()
alens = list()
Expand Down Expand Up @@ -128,39 +210,29 @@ def sort_pp(X, Y, p):
return XX, YY, pp


def main():
args = parse_args()
def CV_KDE_bandwidth(len_values, args):
bandwidths = list()
print(
"Non-positive bandwidth selected selected: recomputing bandwidth with GridSearchCV"
)
for _ in range(3):
grid_finder = GridSearchCV(
KernelDensity(),
{"bandwidth": np.arange(50, 1000, 100)},
n_jobs=args.threads,
cv=3,
verbose=3,
)
grid_finder.fit(
len_values[np.random.randint(len_values.shape[0], size=100_000), :]
)
bandwidths.append(grid_finder.best_params_["bandwidth"])
print(bandwidths[-1])
print(bandwidths)
return np.median(bandwidths)

print("Reading {}".format(args.input))

tlens, alens, end_ratios = get_alignment_lens(args.input)
if args.end_ratio != -1:
end_ratios = [args.end_ratio] * len(end_ratios)
with open(f"{args.output}.sider.tsv", "w+") as outfile:
for w, v in zip(*np.histogram(end_ratios, bins=np.arange(0, 1.01, 0.01))):
outfile.write(f"{w:d}\t{v:.5f}\n")
len_values = np.vstack([tlens, alens]).T
if args.bandwidth <= 0:
print(
"Non-positive bandwidth selected selected: recomputing bandwidth with GridSearchCV"
)
bandwidths = list()
for _ in range(3):
grid_finder = GridSearchCV(
KernelDensity(),
{"bandwidth": np.arange(50, 1000, 100)},
n_jobs=args.threads,
cv=3,
verbose=3,
)
grid_finder.fit(
len_values[np.random.randint(len_values.shape[0], size=100_000), :]
)
bandwidths.append(grid_finder.best_params_["bandwidth"])
print(bandwidths[-1])
print(bandwidths)
args.bandwidth = np.median(bandwidths)
print(f"Using bandwidth = {args.bandwidth}")
def ComputeKDELikelihoods(len_values, args):
kd = KernelDensity(bandwidth=args.bandwidth)
global kde_vals
kde_vals = kd.fit(len_values)
Expand Down Expand Up @@ -202,11 +274,107 @@ def main():
X, Y, P = sort_pp(X, Y, P)
P = np.exp(P).reshape(X_idxs.shape[0] - 1, Y_idxs.shape[0] - 1)

return X_idxs, Y_idxs, P


def PrintEndRatios(end_ratios, args):
if args.end_ratio != -1:
end_ratios = [args.end_ratio] * len(end_ratios)
with open(f"{args.output}.sider.tsv", "w+") as outfile:
for w, v in zip(*np.histogram(end_ratios, bins=np.arange(0, 1.01, 0.01))):
outfile.write(f"{w:d}\t{v:.5f}\n")


def main():
args = parse_args()

print("Reading {}".format(args.input))

if args.model_lengths:
print("Modelling read lengths", file=sys.stderr)
tlens, alens, end_ratios = get_alignment_lens(args.input)
len_values = np.vstack([tlens, alens]).T
args.bandwidth = (
args.bandwidth if args.bandwidth > 0 else CV_KDE_bandwidth(len_values, args)
)
PrintEndRatios(end_ratios, args)
X_idxs, Y_idxs, P = ComputeKDELikelihoods(len_values, args)

print("Writing output...")
np.save(f"{args.output}.X_idxs.npy", X_idxs)
np.save(f"{args.output}.Y_idxs.npy", Y_idxs)
np.save(f"{args.output}.grid.npy", P)

elif args.model_separate:
print("Modelling every truncation type")
trc5, trc3, tlens = get_truncation_lens_paired_with_transcript_lens_X4(
args.input
)

args.bandwidth = (
args.bandwidth if args.bandwidth > 0 else CV_KDE_bandwidth(len_values, args)
)

for strand, trunc_lens, transcript_lens in zip(["+", "-"], trc5, tlens):
len_values = np.vstack([transcript_lens, trunc_lens]).T
print(f"Writing output {strand}-5'...")
X_idxs, Y_idxs, P = ComputeKDELikelihoods(len_values, args)
np.save(f"{args.output}.grid{strand}5.npy", P)
for strand, trunc_lens, transcript_lens in zip(["+", "-"], trc3, tlens):
len_values = np.vstack([transcript_lens, trunc_lens]).T
print(f"Writing output {strand}-3'...")
X_idxs, Y_idxs, P = ComputeKDELikelihoods(len_values, args)
np.save(f"{args.output}.grid{strand}3.npy", P)

np.save(f"{args.output}.X_idxs.npy", X_idxs)
np.save(f"{args.output}.Y_idxs.npy", Y_idxs)
elif args.model_3D:
print("Modelling 3D truncation type")
trc5, trc3, tlens = get_truncation_lens_paired_with_transcript_lens_53(
args.input
)
bins, edges = np.histogramdd(
(trc5, trc3, tlens),
bins=args.grid_step,
density=True,
range=(
(args.grid_start, args.grid_end),
(args.grid_start, args.grid_end),
(args.grid_start, args.grid_end),
),
)
np.save(f"{args.output}.grid.npy", bins)
np.save(f"{args.output}.edges.npy", edges)
else:
print("Modelling truncation lengths", file=sys.stderr)
tlens, alens, end_ratios = get_truncation_lens_paired_with_transcript_lens(
args.input
)
len_values = np.vstack([tlens, alens]).T
args.bandwidth = (
args.bandwidth if args.bandwidth > 0 else CV_KDE_bandwidth(len_values, args)
)
PrintEndRatios(end_ratios, args)
X_idxs, Y_idxs, P = ComputeKDELikelihoods(len_values, args)
print("Writing output...")
np.save(f"{args.output}.X_idxs.npy", X_idxs)
np.save(f"{args.output}.Y_idxs.npy", Y_idxs)
np.save(f"{args.output}.grid.npy", np.transpose(P))


"""
print(f"Using bandwidth = {args.bandwidth}")
X_idxs, Y_idxs, P = ComputeKDELikelihoods(len_values, args)
print("Writing output...")
np.save(f"{args.output}.X_idxs.npy", X_idxs)
np.save(f"{args.output}.Y_idxs.npy", Y_idxs)
np.save(f"{args.output}.grid.npy", P)

if args.model_lengths:
np.save(f"{args.output}.grid.npy", P)
else:
np.save(f"{args.output}.grid.npy", np.transpose(P))
"""

if __name__ == "__main__":
main()
10 changes: 9 additions & 1 deletion src/pimpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,19 @@
#define _PIMPL_H
#include <memory>

#define MODULE_IMPLEMENT_PIMPL_CLASS(CLASS_NAME) \
#define MODULE_IMP_PIMPL(CLASS_NAME) \
CLASS_NAME::CLASS_NAME(int argc, char **argv) : pimpl{std::make_unique<impl>(argc, argv)} {} \
CLASS_NAME::~CLASS_NAME() = default; \
int CLASS_NAME::run() { return pimpl->run(); }

#ifndef MULTI_BINARY
#define MODULE_IMPLEMENT_PIMPL_CLASS(CLASS_NAME) MODULE_IMP_PIMPL(CLASS_NAME)
#else
#define MODULE_IMPLEMENT_PIMPL_CLASS(CLASS_NAME) \
MODULE_IMP_PIMPL(CLASS_NAME) \
int main(int argc, char **argv) { return CLASS_NAME{argc, argv}.run(); }
#endif

#define MODULE_DECLARE_PIMPL_CLASS(CLASS_NAME) \
class CLASS_NAME { \
class impl; \
Expand Down
Loading

0 comments on commit 132c173

Please sign in to comment.