diff --git a/csvtk/cmd/helper.go b/csvtk/cmd/helper.go
index 3510e05..79c961d 100644
--- a/csvtk/cmd/helper.go
+++ b/csvtk/cmd/helper.go
@@ -29,12 +29,13 @@ import (
"strings"
"github.com/brentp/xopen"
+ "github.com/shenwei356/breader"
"github.com/shenwei356/util/stringutil"
"github.com/spf13/cobra"
)
// VERSION of csvtk
-const VERSION = "0.3.5.2"
+const VERSION = "0.3.6"
func checkError(err error) {
if err != nil {
@@ -468,3 +469,34 @@ func removeComma(s string) string {
}
return string(newSlice)
}
+
+// readKVs reads a tab-delimited file into a map from its first column to its second column.
+// The line parser flags empty lines and lines with fewer than two columns as not ok.
+func readKVs(file string) (map[string]string, error) {
+	type KV [2]string
+	fn := func(line string) (interface{}, bool, error) {
+		if len(line) == 0 {
+			return nil, false, nil
+		}
+		items := strings.Split(strings.TrimRight(line, "\r\n"), "\t")
+		if len(items) < 2 {
+			return nil, false, nil
+		}
+		return KV{items[0], items[1]}, true, nil
+	}
+	kvs := make(map[string]string)
+	reader, err := breader.NewBufferedReader(file, 2, 10, fn)
+	if err != nil {
+		return kvs, err
+	}
+	for chunk := range reader.Ch {
+		if chunk.Err != nil {
+			return kvs, chunk.Err // err is nil at this point; the chunk carries the failure
+		}
+		for _, data := range chunk.Data {
+			kv := data.(KV)
+			kvs[kv[0]] = kv[1]
+		}
+	}
+	return kvs, nil
+}
diff --git a/csvtk/cmd/replace.go b/csvtk/cmd/replace.go
index 5021341..3184b4a 100644
--- a/csvtk/cmd/replace.go
+++ b/csvtk/cmd/replace.go
@@ -25,6 +25,8 @@ import (
"fmt"
"regexp"
"runtime"
+ "strconv"
+ "strings"
"github.com/brentp/xopen"
"github.com/spf13/cobra"
@@ -36,6 +38,26 @@ var replaceCmd = &cobra.Command{
Short: "replace data of selected fields by regular expression",
Long: `replace data of selected fields by regular expression
+Note that the replacement supports capture variables.
+e.g. $1 represents the text of the first submatch.
+ATTENTION: use SINGLE quote NOT double quotes in *nix OS.
+
+Examples: Adding space to all bases.
+
+ csvtk replace -p "(.)" -r '$1 ' -s
+
+Or use the \ escape character.
+
+ csvtk replace -p "(.)" -r "\$1 " -s
+
+more on: http://shenwei356.github.io/csvtk/usage/#replace
+
+Special repalcement symbols:
+
+ {nr} Record number, starting from 1
+ {kv} Corresponding value of the key ($1) by key-value file
+
+
`,
Run: func(cmd *cobra.Command, args []string) {
config := getConfigs(cmd)
@@ -51,6 +73,7 @@ var replaceCmd = &cobra.Command{
if pattern == "" {
checkError(fmt.Errorf("flags -p (--pattern) needed"))
}
+
p := pattern
if ignoreCase {
p = "(?i)" + p
@@ -58,6 +81,44 @@ var replaceCmd = &cobra.Command{
patternRegexp, err := regexp.Compile(p)
checkError(err)
+ kvFile := getFlagString(cmd, "kv-file")
+
+ var replaceWithNR bool
+ if reNR.MatchString(replacement) {
+ replaceWithNR = true
+ }
+
+ var replaceWithKV bool
+ var kvs map[string]string
+ if reKV.MatchString(replacement) {
+ replaceWithKV = true
+ if !regexp.MustCompile(`\(.+\)`).MatchString(pattern) {
+ checkError(fmt.Errorf(`value of -p (--pattern) must contains "(" and ")" to capture data which is used specify the KEY`))
+ }
+ if kvFile == "" {
+ checkError(fmt.Errorf(`since repalcement symbol "{kv}"/"{KV}" found in value of flag -r (--replacement), tab-delimited key-value file should be given by flag -k (--kv-file)`))
+ }
+ log.Infof("read key-value file: %s", kvFile)
+ var err error
+ kvs, err = readKVs(kvFile)
+ if err != nil {
+ checkError(fmt.Errorf("read key-value file: %s", err))
+ }
+ if len(kvs) == 0 {
+ checkError(fmt.Errorf("no valid data in key-value file: %s", kvFile))
+ }
+
+ if ignoreCase {
+ kvs2 := make(map[string]string, len(kvs))
+ for k, v := range kvs {
+ kvs2[strings.ToLower(k)] = v
+ }
+ kvs = kvs2
+ }
+
+ log.Infof("%d pairs of key-value loaded", len(kvs))
+ }
+
fieldStr := getFlagString(cmd, "fields")
fields, colnames, negativeFields, needParseHeaderRow := parseFields(cmd, fieldStr, config.NoHeaderRow)
var fieldsMap map[int]struct{}
@@ -102,7 +163,10 @@ var replaceCmd = &cobra.Command{
checkFields := true
var record2 []string // for output
-
+ var r string
+ var found []string
+ var k string
+ nr := 0
for chunk := range csvReader.Ch {
checkError(chunk.Err)
@@ -181,11 +245,33 @@ var replaceCmd = &cobra.Command{
parseHeaderRow2 = false
continue
}
-
+ nr++
for f := range record {
record2[f] = record[f]
if _, ok := fieldsMap[f+1]; ok {
- record2[f] = patternRegexp.ReplaceAllString(record2[f], replacement)
+
+ r = replacement
+
+ if replaceWithNR {
+ r = reNR.ReplaceAllString(r, strconv.Itoa(nr))
+ }
+
+ if replaceWithKV {
+ found = patternRegexp.FindStringSubmatch(record2[f])
+ if len(found) > 0 {
+ k = string(found[1])
+ if ignoreCase {
+ k = strings.ToLower(k)
+ }
+ if _, ok = kvs[k]; ok {
+ r = reKV.ReplaceAllString(r, kvs[k])
+ } else {
+ r = reKV.ReplaceAllString(r, found[1])
+ }
+ }
+ }
+
+ record2[f] = patternRegexp.ReplaceAllString(record2[f], r)
}
}
checkError(writer.Write(record2))
@@ -208,4 +294,9 @@ func init() {
"ATTENTION: use SINGLE quote NOT double quotes in *nix OS or "+
"use the \\ escape character.")
replaceCmd.Flags().BoolP("ignore-case", "i", false, "ignore case")
+ replaceCmd.Flags().StringP("kv-file", "k", "",
+ `tab-delimited key-value file for replacing key with value when using "{kv}" in -r (--replacement)`)
}
+
+var reNR = regexp.MustCompile(`\{(NR|nr)\}`) // matches the "{nr}"/"{NR}" record-number symbol in the replacement
+var reKV = regexp.MustCompile(`\{(KV|kv)\}`) // matches the "{kv}"/"{KV}" key-value symbol in the replacement
diff --git a/doc/docs/bioinf.md b/doc/docs/bioinf.md
new file mode 100644
index 0000000..3de41fc
--- /dev/null
+++ b/doc/docs/bioinf.md
@@ -0,0 +1 @@
+
diff --git a/doc/docs/download.md b/doc/docs/download.md
index aec53aa..40fe3f6 100644
--- a/doc/docs/download.md
+++ b/doc/docs/download.md
@@ -6,9 +6,9 @@
## Current Version
-- [csvtk v0.3.5.2](https://github.com/shenwei356/csvtk/releases/tag/v0.3.6)
- - add flag `--fill` for `csvtk join`, so we can fill the unmatched data
- - fix typo
+- [csvtk v0.3.6](https://github.com/shenwei356/csvtk/releases/tag/v0.3.6)
+ - `csvtk replace` supports replacement symbols `{nr}` (record number)
+ and `{kv}` (corresponding value of the key ($1) by key-value file)
## Installation
@@ -42,6 +42,9 @@ For Go developer, just one command:
## Previous Versions
+- [csvtk v0.3.5.2](https://github.com/shenwei356/csvtk/releases/tag/v0.3.5.2)
+ - add flag `--fill` for `csvtk join`, so we can fill the unmatched data
+ - fix typo
- [csvtk v0.3.5.1](https://github.com/shenwei356/csvtk/releases/tag/v0.3.5.1)
- fix minor bug of reading lines ending with `\r\n` from a dependency package
- [csvtk v0.3.5](https://github.com/shenwei356/csvtk/releases/tag/v0.3.5)
diff --git a/doc/docs/usage.md b/doc/docs/usage.md
index c871dca..548589b 100644
--- a/doc/docs/usage.md
+++ b/doc/docs/usage.md
@@ -19,7 +19,7 @@ Usage
```
Another cross-platform, efficient and practical CSV/TSV toolkit
-Version: 0.3.5.2
+Version: 0.3.6
Author: Wei Shen
@@ -536,6 +536,25 @@ Usage
```
replace data of selected fields by regular expression
+Note that the replacement supports capture variables.
+e.g. $1 represents the text of the first submatch.
+ATTENTION: use SINGLE quote NOT double quotes in *nix OS.
+
+Examples: Adding space to all bases.
+
+ csvtk replace -p "(.)" -r '$1 ' -s
+
+Or use the \ escape character.
+
+ csvtk replace -p "(.)" -r "\$1 " -s
+
+more on: http://shenwei356.github.io/csvtk/usage/#replace
+
+Special replacement symbols:
+
+ {nr} Record number, starting from 1
+ {kv} Corresponding value of the key ($1) by key-value file
+
Usage:
csvtk replace [flags]
@@ -543,6 +562,7 @@ Flags:
-f, --fields string select only these fields. e.g -f 1,2 or -f columnA,columnB (default "1")
-F, --fuzzy-fields using fuzzy fileds, e.g. *name or id123*
-i, --ignore-case ignore case
+ -k, --kv-file string tab-delimited key-value file for replacing key with value when using "{kv}" in -r (--replacement)
-p, --pattern string search regular expression
-r, --replacement string replacement. supporting capture variables. e.g. $1 represents the text of the first submatch. ATTENTION: use SINGLE quote NOT double quotes in *nix OS or use the \ escape character.
@@ -551,6 +571,26 @@ Flags:
Examples
- remove Chinese charactors: `csvtk replace -F -f "*_name" -p "\p{Han}+" -r ""`
+- replace by key-value files: `csvtk replace -f 1 -p "(.+)" -r 'value of $1 is {kv}' -k kv.tsv`
+
+ $ cat data.tsv
+ name id
+ A ID001
+ B ID002
+ C ID004
+
+ $ cat alias.tsv
+ 001 Tom
+ 002 Bob
+ 003 Jim
+
+ $ csvtk replace -t -f 2 -p "ID(.+)" -r "N: {nr}, alias: {kv}" -k alias.tsv data.tsv
+ [INFO] read key-value file: alias.tsv
+ [INFO] 3 pairs of key-value loaded
+ name id
+ A N: 1, alias: Tom
+ B N: 2, alias: Bob
+ C N: 3, alias: 004
## mutate
diff --git a/doc/mkdocs.yml b/doc/mkdocs.yml
index 3e3c851..c0bdcbf 100644
--- a/doc/mkdocs.yml
+++ b/doc/mkdocs.yml
@@ -2,8 +2,11 @@ site_name: csvtk - CSV/TSV Toolkit
pages:
- Home: index.md
- Download: download.md
-- Usage: usage.md
-- Tutorial: tutorial.md
+- Documents:
+ - Usage: usage.md
+ - Tutorial: tutorial.md
+- Links:
+ - Wei Shen's Bioinformatic tools: bioinf.md
theme: mkdocs
repo_url: https://github.com/shenwei356/csvtk
diff --git a/doc/site b/doc/site
index 2bd7163..4057a2f 160000
--- a/doc/site
+++ b/doc/site
@@ -1 +1 @@
-Subproject commit 2bd71639719ae6245acc26969f2480d78328b7f6
+Subproject commit 4057a2facb79ee645a60d28c5396628217e675f3