Skip to content

Commit

Permalink
Esvee: write assembly link frag counts
Browse files Browse the repository at this point in the history
  • Loading branch information
charlesshale committed Mar 31, 2024
1 parent 6577820 commit e2ea2fd
Show file tree
Hide file tree
Showing 11 changed files with 370 additions and 83 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ public class AssemblyConfig

public final int AssemblyRefBaseWriteMax;
public final boolean SkipDiscordant;
public final boolean LogPhaseGroupLinks;
public final int PhaseProcessingLimit;

public final int Threads;
Expand Down Expand Up @@ -255,7 +254,6 @@ public AssemblyConfig(final ConfigBuilder configBuilder)
PhaseProcessingLimit = configBuilder.getInteger(PHASE_PROCESSING_LIMIT);

AssemblyRefBaseWriteMax = configBuilder.getInteger(ASSEMBLY_REF_BASE_WRITE_MAX);
LogPhaseGroupLinks = configBuilder.hasFlag(LOG_PHASE_GROUP_LINKS);

Threads = parseThreads(configBuilder);

Expand Down Expand Up @@ -395,7 +393,6 @@ public AssemblyConfig()

AssemblyRefBaseWriteMax = 0;
SkipDiscordant = true;
LogPhaseGroupLinks = false;
PhaseProcessingLimit = 0;
Threads = 0;
TruthsetFile = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,12 @@ private void formPhaseGroups()

addPerfCounters(phaseSetTasks.stream().collect(Collectors.toList()));

SV_LOGGER.info("created {} phase sets", phaseGroups.stream().mapToInt(x -> x.phaseSets().size()).sum());
int totalRemoteReadSearch = phaseSetTasks.stream().mapToInt(x -> x.totalRemoteReadsSearch()).sum();
int totalRemoteReadMatched = phaseSetTasks.stream().mapToInt(x -> x.totalRemoteReadsMatched()).sum();

SV_LOGGER.info("created {} phase sets, remote read(search={} matched={})",
phaseGroups.stream().mapToInt(x -> x.phaseSets().size()).sum(),
totalRemoteReadSearch, totalRemoteReadMatched);
}

private void addPerfCounters(final List<ThreadTask> tasks)
Expand Down Expand Up @@ -279,6 +284,9 @@ private List<JunctionAssembly> prepareFinalAssemblies()
{
List<JunctionAssembly> allAssemblies = Lists.newArrayList();

// assemblies are source from junction groups
// they could instead be sourced from phase groups, to automatically collect any branched or ref-remote assemblies

int assemblyId = 0;
for(List<JunctionGroup> junctionGroups : mJunctionGroupMap.values())
{
Expand All @@ -303,28 +311,23 @@ private List<JunctionAssembly> prepareFinalAssemblies()
setAssemblyOutcome(branchedAssembly);
}

if(assembly.phaseGroup() != null)
PhaseSet phaseSet = assembly.phaseSet();
List<AssemblyLink> assemblyLinks = phaseSet != null ? phaseSet.findAssemblyLinks(assembly) : Collections.emptyList();

for(AssemblyLink splitLink : assemblyLinks)
{
PhaseSet phaseSet = assembly.phaseGroup().findPhaseSet(assembly);
List<AssemblyLink> assemblyLinks = phaseSet != null ? phaseSet.findAssemblyLinks(assembly) : Collections.emptyList();
JunctionAssembly otherAssembly = splitLink.otherAssembly(assembly);

for(AssemblyLink splitLink : assemblyLinks)
if(otherAssembly.outcome() == REMOTE_REF)
{
JunctionAssembly otherAssembly = splitLink.otherAssembly(assembly);

if(otherAssembly.outcome() == REMOTE_REF)
{
otherAssembly.setId(assemblyId++);
allAssemblies.add(otherAssembly);
}
otherAssembly.setId(assemblyId++);
allAssemblies.add(otherAssembly);
}
}
}
}
}



// TODO: assembly ID could be set once assemblies have been ordered, and a unique assebly string ID also set in a
// fairly deterministic way based on final coords, with any duplicates given an extra incrementor

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ public class RemoteRegionAssembler
private final Map<String,Read> mSourceReads;
private final List<Read> mMatchedRemoteReads;

private int mTotalRemoteReadsSearch;
private int mTotalRemoteReadsMatched;

public RemoteRegionAssembler(final AssemblyConfig config, final BamReader bamReader)
{
mConfig = config;
Expand All @@ -60,8 +63,14 @@ public RemoteRegionAssembler(final AssemblyConfig config, final BamReader bamRea
mRemoteRegion = null;
mSourceReads = Maps.newHashMap();
mMatchedRemoteReads = Lists.newArrayList();

mTotalRemoteReadsSearch = 0;
mTotalRemoteReadsMatched = 0;
}

public int totalRemoteReadsSearch() { return mTotalRemoteReadsSearch; }
public int totalRemoteReadsMatched() { return mTotalRemoteReadsMatched; }

public static boolean isExtensionCandidateAssembly(final JunctionAssembly assembly)
{
if(assembly.refBaseTrimLength() < MIN_VARIANT_LENGTH)
Expand Down Expand Up @@ -135,13 +144,19 @@ public AssemblyLink tryRemoteAssemblyLink(final JunctionAssembly assembly, final
SV_LOGGER.trace("remote region({}) slice", mRemoteRegion);

mMatchedRemoteReads.clear();
mSourceReads.clear();

mTotalRemoteReadsSearch += sourceReads.size();

sourceReads.forEach(x -> mSourceReads.put(x.id(), x));

mBamReader.sliceBam(mRemoteRegion.Chromosome, mRemoteRegion.start(), mRemoteRegion.end(), this::processRecord);

SV_LOGGER.debug("remote region({}) sourcedReads(matched={} unmatched={})",
mRemoteRegion, mMatchedRemoteReads.size(), mSourceReads.size());

mTotalRemoteReadsMatched += mMatchedRemoteReads.size();

if(mMatchedRemoteReads.size() < PRIMARY_ASSEMBLY_MIN_READ_SUPPORT)
return null;

Expand All @@ -156,7 +171,7 @@ public AssemblyLink tryRemoteAssemblyLink(final JunctionAssembly assembly, final
if(assemblyLink == null)
return null;

SV_LOGGER.debug("assembly({}) links with remote region({}) matchedReads({})",
SV_LOGGER.trace("assembly({}) links with remote region({}) matchedReads({})",
assembly, remoteRegion.toString(), mMatchedRemoteReads.size());

return assemblyLink;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,7 @@ private BufferedWriter initialiseWriter()

sj.add("RepeatInfo");
sj.add("RefSideSoftClips");
sj.add("BranchedAssemblyIds");

if(mConfig.LogPhaseGroupLinks)
sj.add("PhaseGroupLinkInfo");
sj.add("BranchedCount");

writer.write(sj.toString());
writer.newLine();
Expand Down Expand Up @@ -159,12 +156,7 @@ public void writeAssembly(final JunctionAssembly assembly)

sj.add(refSideSoftClipsStr(assembly.refSideSoftClips()));

String branchedAssemblyIds = assembly.branchedAssemblies().stream()
.map(x -> String.valueOf(x.id())).collect(Collectors.joining(";"));
sj.add(branchedAssemblyIds);

if(mConfig.LogPhaseGroupLinks)
sj.add(assembly.phaseGroupLinkingInfo());
sj.add(String.valueOf(assembly.branchedAssemblies().size()));

mWriter.write(sj.toString());
mWriter.newLine();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -272,60 +272,78 @@ protected static void addPhasingHeader(final StringJoiner sj)
sj.add("SvLength");
sj.add("InsertedBases");
sj.add("OverlapBases");
sj.add("RefLinkedFrags");

sj.add("SecondaryLinks");
}

private static final int NO_PHASE_ID = -1;

protected static void addPhasingInfo(final JunctionAssembly assembly, final StringJoiner sj)
{
PhaseGroup phaseGroup = assembly.phaseGroup();

int phaseGroupId = NO_PHASE_ID;
int phaseGroupCount = 0;
int phaseSetId = NO_PHASE_ID;
int phaseSetCount = 0;
String splitLinkInfo = "";
String facingLinkInfo = "";
String svType = "";
int svLinkLength = 0;
String insertedBases = "";
String overlapBases = "";
String linkFragmentInfo = "";
String secondaryLinkInfo = "";

if(phaseGroup != null)
{
sj.add(String.valueOf(phaseGroup.id()));
sj.add(String.valueOf(phaseGroup.assemblyCount()));
}
else
{
sj.add("-1").add("0");
}
phaseGroupId = phaseGroup.id();
phaseGroupCount = phaseGroup.assemblyCount();

PhaseSet phaseSet = phaseGroup != null ? phaseGroup.findPhaseSet(assembly) : null;
PhaseSet phaseSet = assembly.phaseSet();

if(phaseSet != null)
{
sj.add(String.valueOf(phaseSet.id()));
sj.add(String.valueOf(phaseSet.assemblies().size()));
if(phaseSet != null)
{
phaseSetId = phaseSet.id();
phaseSetCount = phaseSet.assemblies().size();

List<AssemblyLink> assemblyLinks = phaseSet.findAssemblyLinks(assembly);
List<AssemblyLink> assemblyLinks = phaseSet.findAssemblyLinks(assembly);

List<AssemblyLink> splitLinks = assemblyLinks.stream()
.filter(x -> x.type() == LinkType.SPLIT || x.type() == LinkType.INDEL).collect(Collectors.toList());
List<AssemblyLink> splitLinks = assemblyLinks.stream()
.filter(x -> x.type() == LinkType.SPLIT || x.type() == LinkType.INDEL).collect(Collectors.toList());

List<AssemblyLink> facingLinks = assemblyLinks.stream().filter(x -> x.type() == LinkType.FACING).collect(Collectors.toList());
sj.add(assemblyLinksStr(assembly, splitLinks));
sj.add(assemblyLinksStr(assembly, facingLinks));
List<AssemblyLink> facingLinks = assemblyLinks.stream().filter(x -> x.type() == LinkType.FACING).collect(Collectors.toList());
splitLinkInfo = assemblyLinksStr(assembly, splitLinks);
facingLinkInfo = assemblyLinksStr(assembly, facingLinks);

if(!splitLinks.isEmpty())
{
AssemblyLink svLink = splitLinks.get(0);
sj.add(svLink.svType().toString());
sj.add(String.valueOf(svLink.length()));
sj.add(svLink.insertedBases());
sj.add(svLink.overlapBases());
}
else
{
sj.add("").add("0").add("").add("");
if(!splitLinks.isEmpty())
{
AssemblyLink svLink = splitLinks.get(0);
svType = svLink.svType().toString();
svLinkLength = svLink.length();
insertedBases = svLink.insertedBases();
overlapBases = svLink.overlapBases();
linkFragmentInfo = assemblyLinkFragmentStr(svLink);
}

List<AssemblyLink> secondarySplitLinks = phaseGroup.findSecondarySplitLinks(assembly);
secondaryLinkInfo = assemblyLinksStr(assembly, secondarySplitLinks);
}
}
else
{
sj.add("-1").add("0").add("").add("").add("").add("0").add("").add("");
}

List<AssemblyLink> secondarySplitLinks = phaseGroup != null ? phaseGroup.findSecondarySplitLinks(assembly) : Collections.emptyList();
sj.add(assemblyLinksStr(assembly, secondarySplitLinks));
sj.add(String.valueOf(phaseGroupId));
sj.add(String.valueOf(phaseGroupCount));
sj.add(String.valueOf(phaseSetId));
sj.add(String.valueOf(phaseSetCount));
sj.add(splitLinkInfo);
sj.add(facingLinkInfo);
sj.add(svType);
sj.add(String.valueOf(svLinkLength));
sj.add(insertedBases);
sj.add(overlapBases);
sj.add(linkFragmentInfo);
sj.add(secondaryLinkInfo);
}

protected static String assemblyLinksStr(final JunctionAssembly assembly, final List<AssemblyLink> assemblyLinks)
Expand All @@ -343,6 +361,62 @@ protected static String assemblyLinksStr(final JunctionAssembly assembly, final
return sj.toString();
}

private static String assemblyLinkFragmentStr(final AssemblyLink assemblyLink)
{
int uniqueFrags = 0;
int uniqueRefFrags = 0;
int matchedFrags = 0;
int matchedRefFrags = 0;

JunctionAssembly first = assemblyLink.first();
JunctionAssembly second = assemblyLink.second();

Set<String> uniqueReadIds = Sets.newHashSet();

for(AssemblySupport firstSupport : first.support())
{
if(uniqueReadIds.contains(firstSupport.read().id()))
continue;

uniqueReadIds.add(firstSupport.read().id());

boolean isReference = firstSupport.read().isReference();

if(second.support().stream().anyMatch(x -> x.read().matchesFragment(firstSupport.read())))
{
++matchedFrags;

if(isReference)
++matchedRefFrags;
}
else
{
++uniqueFrags;

if(isReference)
++uniqueRefFrags;
}
}

for(AssemblySupport secondSupport : second.support())
{
if(uniqueReadIds.contains(secondSupport.read().id()))
continue;

uniqueReadIds.add(secondSupport.read().id());

if(first.support().stream().noneMatch(x -> x.read().matchesFragment(secondSupport.read())))
{
++uniqueFrags;

if(secondSupport.read().isReference())
++uniqueRefFrags;
}
}

return format("Match=%d;MatchRef=%d;Unique=%d;UniqueRef=%d", matchedFrags, matchedRefFrags, uniqueFrags, uniqueRefFrags);
}

protected static String repeatsInfoStr(final List<RepeatInfo> repeats)
{
if(repeats.isEmpty())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,15 +111,8 @@ public void writeAssembly(final JunctionAssembly assembly)
return;

// write the assembly itself and then all its contributing reads
AssemblyLink assemblyLink = null;

if(assembly.phaseGroup() != null)
{
PhaseSet phaseSet = assembly.phaseGroup().findPhaseSet(assembly);

if(phaseSet != null)
assemblyLink = phaseSet.findSplitLink(assembly);
}
PhaseSet phaseSet = assembly.phaseSet();
AssemblyLink assemblyLink = phaseSet != null ? phaseSet.findSplitLink(assembly) : null;

String assemblyReadId = format("ASSEMBLY_%d_%s", assembly.id(), assembly.junction().coords());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ public void addVariant(final JunctionAssembly assembly)
genotypes.add(buildGenotype(assembly, sampleId, sampleData));
}

PhaseSet phaseSet = assembly.phaseGroup() != null ? assembly.phaseGroup().findPhaseSet(assembly) : null;
PhaseSet phaseSet = assembly.phaseSet();
List<AssemblyLink> assemblyLinks = phaseSet != null ? phaseSet.findAssemblyLinks(assembly) : Collections.emptyList();
AssemblyLink splitLink = assemblyLinks.stream().filter(x -> x.type() == LinkType.SPLIT).findFirst().orElse(null);
StructuralVariantType svType = splitLink != null ? splitLink.svType() : SGL;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,7 @@ public void run()
}
}
}

public int totalRemoteReadsSearch() { return mRemoteRegionAssembler.totalRemoteReadsSearch(); }
public int totalRemoteReadsMatched() { return mRemoteRegionAssembler.totalRemoteReadsMatched(); }
}
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,10 @@ public IndelCoords indelCoords()
return mIndelCoords;
}

public boolean matchesFragment(final Read other) { return this == other || id().equals(other.id()); }
public boolean matchesFragment(final Read other)
{
return this == other || this == other.mateRead() || this == other.supplementaryRead() || id().equals(other.id());
}

public String toString()
{
Expand Down
Loading

0 comments on commit e2ea2fd

Please sign in to comment.