From 7bda8bb441858795ce6b875fda8adb4bd0e8dcb8 Mon Sep 17 00:00:00 2001 From: Brent Pedersen Date: Mon, 19 Jun 2017 15:39:01 -0600 Subject: [PATCH] add by_alt to address #68 --- README.md | 1 + api/api.go | 49 ++++++++++++++++++++++----- api/api_test.go | 30 ++++++++++++++++ api/reducers.go | 1 + docs/CHANGES.md | 9 +++-- docs/index.md | 1 + tests/functional-test.sh | 2 ++ tests/id-test/small.toml | 2 +- tests/multiple-alts/ma-db.vcf.gz | Bin 495 -> 518 bytes tests/multiple-alts/ma-db.vcf.gz.tbi | Bin 164 -> 163 bytes tests/multiple-alts/ma.conf | 6 ++-- vcfanno.go | 2 +- 12 files changed, 88 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 6d0bbc4..7dc10fb 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,7 @@ this case, the op determines how the many values are `reduced`. Valid operations + min // numbers only + sum // numbers only + uniq // comma-delimited list of uniq vlues + + by_alt // comma-delimited by alt, pipe-delimited (|) for multiple annos for the same alt. There are some operations that are only for `postannotation`: diff --git a/api/api.go b/api/api.go index c70a544..b3f5caf 100644 --- a/api/api.go +++ b/api/api.go @@ -201,6 +201,24 @@ func allEqual(a, b []string) bool { return true } +// given the output from handleA and the alts: +// append new values to the appropriate alt. +// 22,33, A,G -> 22,33 +// then XX, G -> 22,33|XX +// then YY, A -> 22|YY,33|XX +func byAlt(in []interface{}, qAlts []string, existing [][]string) [][]string { + if existing == nil { + existing = make([][]string, len(qAlts)) + } + for i, v := range in { + if v == "." || v == "" || v == nil { + continue + } + existing[i] = append(existing[i], fmt.Sprintf("%v", v)) + } + return existing +} + // handleA converts the `val` to the correct slice of vals to match what's isnt // qAlts and oAlts. Then length of the returned value should always be equal // to the len of qAlts. 
@@ -257,6 +275,7 @@ func handleA(val interface{}, qAlts []string, oAlts []string, out []interface{}) func collect(v interfaces.IVariant, rels []interfaces.Relatable, src *Source, strict bool) ([]interface{}, error) { coll := make([]interface{}, 0, len(rels)) var val interface{} + var valByAlt [][]string var finalerr error for _, other := range rels { if int(other.Source())-1 != src.Index { @@ -293,15 +312,17 @@ func collect(v interfaces.IVariant, rels []interfaces.Relatable, src *Source, st continue } } + if src.Op == "by_alt" { + // by_alt uses the handleA machinery to match values to the query alts, then + // appends each value to the list kept for its alternate allele. + out := make([]interface{}, len(v.Alt())) + handleA(val, v.Alt(), o.Alt(), out) + valByAlt = byAlt(out, v.Alt(), valByAlt) + continue + } - /* - if src.Field == "ID" || src.Field == "FILTER" { - coll = append(coll, val) - continue - } - */ // special-case 'self' when the annotation has Number=A and either query or anno have multiple alts - // note that if len(rels) > 1, we could miss some since we return here. however, that shouldn't happen as we are matching on ref and alt and we wouldn't know what to do anyway. + // so that we get the alts matched up. if src.NumberA && src.Op == "self" && src.Field != "ID" && src.Field != "FILTER" { var out []interface{} if len(coll) > 0 { @@ -390,6 +411,15 @@ func collect(v interfaces.IVariant, rels []interfaces.Relatable, src *Source, st coll = []interface{}{msg} } } + if valByAlt != nil { + for _, v := range valByAlt { + if len(v) == 0 { + coll = append(coll, ".") + } else { + coll = append(coll, strings.Join(v, "|")) + } + } + } return coll, finalerr } @@ -460,7 +490,10 @@ func (s *Source) AnnotateOne(v interfaces.IVariant, vals []interface{}, prefix s func (s *Source) UpdateHeader(r HeaderUpdater, ends bool, htype string, number string, desc string) { ntype := "String" // for 'self' and 'first', we can get the type from the header of the annotation file. 
- if htype != "" && (s.Op == "self" || s.Op == "first") { + if s.Op == "by_alt" { + number = "A" + ntype = htype + } else if htype != "" && (s.Op == "self" || s.Op == "first") { ntype = htype } else { if strings.HasSuffix(s.Name, "_float") { diff --git a/api/api_test.go b/api/api_test.go index be7b42e..a62d599 100644 --- a/api/api_test.go +++ b/api/api_test.go @@ -348,3 +348,33 @@ func TestHandlAMulti(t *testing.T) { } } + +// given the output from handleA and the alts: +// append new values to the appropriate alt. +// 22,33, A,G -> 22,33 +// then XX, G -> 22,33|XX +// then YY, A -> 22|YY,33|XX +// func byAlt(in []interface{}, qAlts []string, existing [][]string) [][]string { + +var byAltTests = []struct { + in []interface{} + out [][]string +}{ + {[]interface{}{"AAA", "."}, [][]string{[]string{"AAA"}, nil}}, + {[]interface{}{"AAA", "BBB"}, [][]string{[]string{"AAA"}, []string{"BBB"}}}, + {[]interface{}{".", "BBB"}, [][]string{nil, []string{"BBB"}}}, +} + +func TestByAlt(t *testing.T) { + + qAlts := []string{"C", "T"} + for _, tt := range byAltTests { + var existing [][]string + + existing = byAlt(tt.in, qAlts, existing) + + if !reflect.DeepEqual(existing, tt.out) { + t.Fatalf("got %v. expected %v", existing, tt.out) + } + } +} diff --git a/api/reducers.go b/api/reducers.go index 7867df0..ea9ec87 100644 --- a/api/reducers.go +++ b/api/reducers.go @@ -272,4 +272,5 @@ var Reducers = map[string]Reducer{ "div2": Reducer(div2), "DP2": Reducer(dp2), "setid": Reducer(setid), + "by_alt": Reducer(concat), } diff --git a/docs/CHANGES.md b/docs/CHANGES.md index a3bd486..f78ddc6 100644 --- a/docs/CHANGES.md +++ b/docs/CHANGES.md @@ -1,9 +1,14 @@ v0.2.7 (dev) ------------ + restore multiple threads per annotation file after fix in biogo/hts (#64) -+ add `setid` builtin to set the ID field and remove need for lua and fix some bugs. ++ new op `setid` builtin to set the ID field and remove need for lua and fix some bugs. 
+ fix bug in `self` with multiple alternates when there were multiple overlaps (thanks Matthew). -+ fix for #68, #69 ++ fix for #68, #69 -- with Number=A, op="self", the output will always have a number of elements + equal to the number of alternates. If there is more than one annotation for a given site, later values will + overwrite previous ones. ++ new `op` by_alt that will have Number=A and will append multiple annotations for the same alternate + (from the same file) and output them as pipe-delimited. e.g. with 2 alts, it might look like: `0.111,0.222|0.333` + when the 1st alternate has a single value and the 2nd alternate has 2 values. v0.2.6 ------ diff --git a/docs/index.md b/docs/index.md index 6d0bbc4..7dc10fb 100644 --- a/docs/index.md +++ b/docs/index.md @@ -118,6 +118,7 @@ this case, the op determines how the many values are `reduced`. Valid operations + min // numbers only + sum // numbers only + uniq // comma-delimited list of uniq vlues + + by_alt // comma-delimited by alt, pipe-delimited (|) for multiple annos for the same alt. 
There are some operations that are only for `postannotation`: diff --git a/tests/functional-test.sh b/tests/functional-test.sh index eb02e2e..9ce18d6 100755 --- a/tests/functional-test.sh +++ b/tests/functional-test.sh @@ -138,6 +138,8 @@ multiallelics() { } run check_multiallelics multiallelics assert_exit_code 0 +assert_in_stdout "STR_by=hello,goodbye|goodbye-again" +assert_in_stdout "AF_by=0.000599042|0.99,0.00299521" idtest() { vcfanno -lua tests/id-test/some.lua tests/id-test/small.toml tests/id-test/small.vcf.gz diff --git a/tests/id-test/small.toml b/tests/id-test/small.toml index e3eb7cf..e5450bc 100644 --- a/tests/id-test/small.toml +++ b/tests/id-test/small.toml @@ -2,7 +2,7 @@ file="tests/id-test/dbsnp.small.vcf.gz" fields=["ID", "CAF"] names=["rs_ids", "CAF"] -ops=["self", "self"] +ops=["by_alt", "self"] [[annotation]] file="tests/id-test/cosmic.small.vcf.gz" diff --git a/tests/multiple-alts/ma-db.vcf.gz b/tests/multiple-alts/ma-db.vcf.gz index 8b9ac9090c884cfc57b9f99c7889309b34bb1f8e..c487c599aa613dba2a7b53c55b824b989ce7d099 100644 GIT binary patch delta 499 zcmVct#U2mH(7=^FhUtu*jo8(3Q#8@4Z zEE2j%Scar)1MRbc~+fjxVN- zG|J8=6Rm1%3e%1$S!44G*1Zrm2bg2@e7BG!!v1I2jk5 zOD_+EWWlB5B#9v+2KklWM^U8&B@^$+sno@%RIie1l1rzGoo+M8-;50PrL$4YQNb0Uvi@>Zt$# delta 476 zcmV<20VDo~1n&cXABzYC000000RIL6LPG)o(gAIdU2mH(6o#+eUtu*jn`A})#8@qp zED}1Ttmy*lJxv^nL?8oqQvUsHlB@$oTwuxPdEVD&HrrRV*|*)Hu+eLr{NZzQH=8Bt zDqn7*$8-_B=qy7E({H=#Xsfn~?sZ*5M_JSL@L2S{+1-DCb4>c2tOr8J==tODX1YkD z{B$%?+E`PXZfwb0Thy=_24TC0g%>u3HSScK+Vrq$;MSjaury6;ovEJ$C6l0}ZVNk( zS@yPaZ*!8f7VbOq`@=NbllR+q1G}Or4g7);`8w8T`DWtJY*#hqI3Q~e2KK{+$K&st z^$+l}&Oo|<0Gnk3^eP8Gf9VxC?-Y3b9RWm8$-x|G@B|3Pz^@tTI3kiM#3;cJI*IU{ zaY;FM^FhEu;^uShKcI&n%15jah+Q~LunH+h?u3O>m{aEvEC`kG5)e;A5hE9palyIt zRF+{{={OTW~s8WKGft8X=$HtBxb;O8MUrD7xj@g6TbM$WNx*a&h7`vHC z-CnMwkP5rb|HXe=P}#P-x0CTp3tc6@<)>*2MCHH!H)=>6Fh&P3p41@x^8r*UAuctf S3MIW&<1d1LF`;9V=K&vESLi?h diff --git a/tests/multiple-alts/ma-db.vcf.gz.tbi b/tests/multiple-alts/ma-db.vcf.gz.tbi index 
b4dd148b4dba0bcd0ff69acd1144878e65909a16..377d9c3927098f590032e6df86afa6279e67fdcc 100644 GIT binary patch delta 110 zcmV-!0FnQs0iyvCABzYC000000RIL6LPG)ohLI5=Q7bTuivblB2|-NKVTSNip>zpU z9L9&~_k@bW%v}zpGoj+Ip?oQ*)3TxBFh0y4U#K`tKTMwvlG(WE<-?qQjYh-5j#L^J Q*648y0|3@W0dtXrAZ&CjdjJ3c delta 140 zcmZ3?xP(zqzMF%E0R;ZDF*rFhF|dQa-l5HX-R<`6)`GpLZ Z7$*jXszV!l{g@aS(5;eYU