diff --git a/csvtk/cmd/helper.go b/csvtk/cmd/helper.go
index 3510e05..79c961d 100644
--- a/csvtk/cmd/helper.go
+++ b/csvtk/cmd/helper.go
@@ -29,12 +29,13 @@ import (
 	"strings"
 
 	"github.com/brentp/xopen"
+	"github.com/shenwei356/breader"
 	"github.com/shenwei356/util/stringutil"
 	"github.com/spf13/cobra"
 )
 
 // VERSION of csvtk
-const VERSION = "0.3.5.2"
+const VERSION = "0.3.6"
 
 func checkError(err error) {
 	if err != nil {
@@ -468,3 +469,44 @@ func removeComma(s string) string {
 	}
 	return string(newSlice)
 }
+
+// readKVs loads a tab-delimited key-value file into a map.
+//
+// The first field of each line is the key and the second field is the value;
+// any further fields are ignored. Empty lines and lines with fewer than two
+// fields produce no entry. When a key is seen again, its value is overwritten.
+//
+// A non-nil error is returned if the reader cannot be created or if any chunk
+// delivered by the reader carries an error.
+func readKVs(file string) (map[string]string, error) {
+	type KV [2]string
+	fn := func(line string) (interface{}, bool, error) {
+		if len(line) == 0 {
+			return nil, false, nil
+		}
+		items := strings.Split(strings.TrimRight(line, "\r\n"), "\t")
+		if len(items) < 2 {
+			return nil, false, nil
+		}
+
+		return KV([2]string{items[0], items[1]}), true, nil
+	}
+	kvs := make(map[string]string)
+	reader, err := breader.NewBufferedReader(file, 2, 10, fn)
+	if err != nil {
+		return kvs, err
+	}
+	var items KV
+	for chunk := range reader.Ch {
+		if chunk.Err != nil {
+			// Report the chunk's own error: err is always nil here because
+			// the reader was created successfully above.
+			return kvs, chunk.Err
+		}
+		for _, data := range chunk.Data {
+			items = data.(KV)
+			kvs[items[0]] = items[1]
+		}
+	}
+	return kvs, nil
+}
diff --git a/csvtk/cmd/replace.go b/csvtk/cmd/replace.go
index 5021341..3184b4a 100644
--- a/csvtk/cmd/replace.go
+++ b/csvtk/cmd/replace.go
@@ -25,6 +25,8 @@ import (
 	"fmt"
 	"regexp"
 	"runtime"
+	"strconv"
+	"strings"
 
 	"github.com/brentp/xopen"
 	"github.com/spf13/cobra"
@@ -36,6 +38,26 @@ var replaceCmd = &cobra.Command{
 	Short: "replace data of selected fields by regular expression",
 	Long: `replace data of selected fields by regular expression
 
+Note that the replacement supports capture variables.
+e.g. $1 represents the text of the first submatch.
+ATTENTION: use SINGLE quote NOT double quotes in *nix OS.
+
+Examples: Adding space to all bases.
+
+    csvtk replace -p "(.)" -r '$1 ' -s
+
+Or use the \ escape character.
+ + csvtk replace -p "(.)" -r "\$1 " -s + +more on: http://shenwei356.github.io/csvtk/usage/#replace + +Special repalcement symbols: + + {nr} Record number, starting from 1 + {kv} Corresponding value of the key ($1) by key-value file + + `, Run: func(cmd *cobra.Command, args []string) { config := getConfigs(cmd) @@ -51,6 +73,7 @@ var replaceCmd = &cobra.Command{ if pattern == "" { checkError(fmt.Errorf("flags -p (--pattern) needed")) } + p := pattern if ignoreCase { p = "(?i)" + p @@ -58,6 +81,44 @@ var replaceCmd = &cobra.Command{ patternRegexp, err := regexp.Compile(p) checkError(err) + kvFile := getFlagString(cmd, "kv-file") + + var replaceWithNR bool + if reNR.MatchString(replacement) { + replaceWithNR = true + } + + var replaceWithKV bool + var kvs map[string]string + if reKV.MatchString(replacement) { + replaceWithKV = true + if !regexp.MustCompile(`\(.+\)`).MatchString(pattern) { + checkError(fmt.Errorf(`value of -p (--pattern) must contains "(" and ")" to capture data which is used specify the KEY`)) + } + if kvFile == "" { + checkError(fmt.Errorf(`since repalcement symbol "{kv}"/"{KV}" found in value of flag -r (--replacement), tab-delimited key-value file should be given by flag -k (--kv-file)`)) + } + log.Infof("read key-value file: %s", kvFile) + var err error + kvs, err = readKVs(kvFile) + if err != nil { + checkError(fmt.Errorf("read key-value file: %s", err)) + } + if len(kvs) == 0 { + checkError(fmt.Errorf("no valid data in key-value file: %s", kvFile)) + } + + if ignoreCase { + kvs2 := make(map[string]string, len(kvs)) + for k, v := range kvs { + kvs2[strings.ToLower(k)] = v + } + kvs = kvs2 + } + + log.Infof("%d pairs of key-value loaded", len(kvs)) + } + fieldStr := getFlagString(cmd, "fields") fields, colnames, negativeFields, needParseHeaderRow := parseFields(cmd, fieldStr, config.NoHeaderRow) var fieldsMap map[int]struct{} @@ -102,7 +163,10 @@ var replaceCmd = &cobra.Command{ checkFields := true var record2 []string // for output - + var r 
string + var found []string + var k string + nr := 0 for chunk := range csvReader.Ch { checkError(chunk.Err) @@ -181,11 +245,33 @@ var replaceCmd = &cobra.Command{ parseHeaderRow2 = false continue } - + nr++ for f := range record { record2[f] = record[f] if _, ok := fieldsMap[f+1]; ok { - record2[f] = patternRegexp.ReplaceAllString(record2[f], replacement) + + r = replacement + + if replaceWithNR { + r = reNR.ReplaceAllString(r, strconv.Itoa(nr)) + } + + if replaceWithKV { + found = patternRegexp.FindStringSubmatch(record2[f]) + if len(found) > 0 { + k = string(found[1]) + if ignoreCase { + k = strings.ToLower(k) + } + if _, ok = kvs[k]; ok { + r = reKV.ReplaceAllString(r, kvs[k]) + } else { + r = reKV.ReplaceAllString(r, found[1]) + } + } + } + + record2[f] = patternRegexp.ReplaceAllString(record2[f], r) } } checkError(writer.Write(record2)) @@ -208,4 +294,9 @@ func init() { "ATTENTION: use SINGLE quote NOT double quotes in *nix OS or "+ "use the \\ escape character.") replaceCmd.Flags().BoolP("ignore-case", "i", false, "ignore case") + replaceCmd.Flags().StringP("kv-file", "k", "", + `tab-delimited key-value file for replacing key with value when using "{kv}" in -r (--replacement)`) } + +var reNR = regexp.MustCompile(`\{(NR|nr)\}`) +var reKV = regexp.MustCompile(`\{(KV|kv)\}`) diff --git a/doc/docs/bioinf.md b/doc/docs/bioinf.md new file mode 100644 index 0000000..3de41fc --- /dev/null +++ b/doc/docs/bioinf.md @@ -0,0 +1 @@ + diff --git a/doc/docs/download.md b/doc/docs/download.md index aec53aa..40fe3f6 100644 --- a/doc/docs/download.md +++ b/doc/docs/download.md @@ -6,9 +6,9 @@ ## Current Version -- [csvtk v0.3.5.2](https://github.com/shenwei356/csvtk/releases/tag/v0.3.6) - - add flag `--fill` for `csvtk join`, so we can fill the unmatched data - - fix typo +- [csvtk v0.3.6](https://github.com/shenwei356/csvtk/releases/tag/v0.3.6) + - `csvtk replace` support replacement symbols `{nr}` (record number) + and `{kv}` (corresponding value of the key ($1) by 
key-value file) ## Installation @@ -42,6 +42,9 @@ For Go developer, just one command: ## Previous Versions +- [csvtk v0.3.5.2](https://github.com/shenwei356/csvtk/releases/tag/v0.3.5.2) + - add flag `--fill` for `csvtk join`, so we can fill the unmatched data + - fix typo - [csvtk v0.3.5.1](https://github.com/shenwei356/csvtk/releases/tag/v0.3.5.1) - fix minor bug of reading lines ending with `\r\n` from a dependency package - [csvtk v0.3.5](https://github.com/shenwei356/csvtk/releases/tag/v0.3.5) diff --git a/doc/docs/usage.md b/doc/docs/usage.md index c871dca..548589b 100644 --- a/doc/docs/usage.md +++ b/doc/docs/usage.md @@ -19,7 +19,7 @@ Usage ``` Another cross-platform, efficient and practical CSV/TSV toolkit -Version: 0.3.5.2 +Version: 0.3.6 Author: Wei Shen @@ -536,6 +536,25 @@ Usage ``` replace data of selected fields by regular expression +Note that the replacement supports capture variables. +e.g. $1 represents the text of the first submatch. +ATTENTION: use SINGLE quote NOT double quotes in *nix OS. + +Examples: Adding space to all bases. + + csvtk replace -p "(.)" -r '$1 ' -s + +Or use the \ escape character. + + csvtk replace -p "(.)" -r "\$1 " -s + +more on: http://shenwei356.github.io/csvtk/usage/#replace + +Special repalcement symbols: + + {nr} Record number, starting from 1 + {kv} Corresponding value of the key ($1) by key-value file + Usage: csvtk replace [flags] @@ -543,6 +562,7 @@ Flags: -f, --fields string select only these fields. e.g -f 1,2 or -f columnA,columnB (default "1") -F, --fuzzy-fields using fuzzy fileds, e.g. *name or id123* -i, --ignore-case ignore case + -k, --kv-file string tab-delimited key-value file for replacing key with value when using "{kv}" in -r (--replacement) -p, --pattern string search regular expression -r, --replacement string replacement. supporting capture variables. e.g. $1 represents the text of the first submatch. ATTENTION: use SINGLE quote NOT double quotes in *nix OS or use the \ escape character. 
@@ -551,6 +571,26 @@ Flags: Examples - remove Chinese charactors: `csvtk replace -F -f "*_name" -p "\p{Han}+" -r ""` +- replace by key-value files: `csvtk replace -f 1 -p "(.+)" -r 'value of $1 is {kv}' -k kv.tsv` + + $ cat data.tsv + name id + A ID001 + B ID002 + C ID004 + + $ cat alias.tsv + 001 Tom + 002 Bob + 003 Jim + + $ csvtk replace -t -f 2 -p "ID(.+)" -r "N: {nr}, alias: {kv}" -k alias.tsv data.tsv + [INFO] read key-value file: alias.tsv + [INFO] 3 pairs of key-value loaded + name id + A N: 1, alias: Tom + B N: 2, alias: Bob + C N: 3, alias: 004 ## mutate diff --git a/doc/mkdocs.yml b/doc/mkdocs.yml index 3e3c851..c0bdcbf 100644 --- a/doc/mkdocs.yml +++ b/doc/mkdocs.yml @@ -2,8 +2,11 @@ site_name: csvtk - CSV/TSV Toolkit pages: - Home: index.md - Download: download.md -- Usage: usage.md -- Tutorial: tutorial.md +- Documents: + - Usage: usage.md + - Tutorial: tutorial.md +- Links: + - Wei Shen's Bioinformatic tools: bioinf.md theme: mkdocs repo_url: https://github.com/shenwei356/csvtk diff --git a/doc/site b/doc/site index 2bd7163..4057a2f 160000 --- a/doc/site +++ b/doc/site @@ -1 +1 @@ -Subproject commit 2bd71639719ae6245acc26969f2480d78328b7f6 +Subproject commit 4057a2facb79ee645a60d28c5396628217e675f3