Skip to content

Commit

Permalink
mutate2: fix bug when using two or more columns with common prefixes …
Browse files Browse the repository at this point in the history
…in column names. fix #173
  • Loading branch information
shenwei356 committed Nov 10, 2021
1 parent 66572fc commit 3e3dbf7
Show file tree
Hide file tree
Showing 14 changed files with 190 additions and 125 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ csvtk/binaries*
doc/site
*ssshtest
.vscode
.brename_detail.txt
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
- `csvtk mutate2/filter2`:
- change the way of expression evaluation.
- add custom functions: `len()`. [#153](https://github.com/shenwei356/csvtk/issues/153)
- fix bug when using two or more columns with common prefixes in column names. [#173](https://github.com/shenwei356/csvtk/issues/173)
- `csvtk cut`: new flags `-m/--allow-missing-col` and `-b/--blank-missing-col`. [#156](https://github.com/shenwei356/csvtk/issues/156)
- `csvtk pretty`: still add header row for empty column.
- `csvtk csv2md`: better format.
Expand Down
1 change: 0 additions & 1 deletion csvtk/.brename_detail.txt

This file was deleted.

2 changes: 0 additions & 2 deletions csvtk/cmd/cut.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,6 @@ Examples:
if !allowMissingColumn {
checkError(fmt.Errorf(`column "%s" not existed in file: %s`, col[1:], file))
}
} else {

}
} else {
if _, ok := colnames2fileds[col]; !ok {
Expand Down
3 changes: 0 additions & 3 deletions csvtk/cmd/dim.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,6 @@ var dimCmd = &cobra.Command{
if numRows > 0 && !config.NoHeaderRow {
numRows--
}
if numRows < 0 {
numRows = 0
}

if rows {
if noFiles {
Expand Down
75 changes: 61 additions & 14 deletions csvtk/cmd/filter2.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"fmt"
"regexp"
"runtime"
"sort"
"strconv"
"strings"

Expand Down Expand Up @@ -85,6 +86,8 @@ Custom functions:
checkError(fmt.Errorf("invalid filter: %s", filterStr))
}

digitsAsString := getFlagBool(cmd, "numeric-as-string")

fs := make([]string, 0)
for _, f := range reFilter2.FindAllStringSubmatch(filterStr, -1) {
fs = append(fs, f[1])
Expand Down Expand Up @@ -140,6 +143,8 @@ Custom functions:

// -----------------------------------

hasNullCoalescence := reNullCoalescence.MatchString(filterStr)

var quote string = `'`
if strings.Contains(filterStr, `"`) {
quote = `"`
Expand Down Expand Up @@ -198,6 +203,8 @@ Custom functions:
var colnamesMap map[string]*regexp.Regexp

parameters := make(map[string]string, len(colnamesMap))
parameters2 := make(map[string]interface{}, len(colnamesMap))
parameters2["shenweiNULL"] = nil

checkFields := true
var flag bool
Expand All @@ -209,6 +216,8 @@ Custom functions:
var recordWithN []string
var valueFloat float64

keys := make([]string, 0, 8)

printMetaLine := true
for chunk := range csvReader.Ch {
checkError(chunk.Err)
Expand Down Expand Up @@ -311,17 +320,25 @@ Custom functions:
checkFields = false
}

flag = false
// prepare parameters
if !usingColname {
for _, fieldTmp = range fields {
value = record[fieldTmp-1]
col = fmt.Sprintf("shenwei%d", fieldTmp)

if reDigitals.MatchString(value) {
valueFloat, _ = strconv.ParseFloat(removeComma(value), 64)
parameters[col] = fmt.Sprintf("%.16f", valueFloat)
if digitsAsString || containCustomFuncs {
parameters[col] = quote + value + quote
} else {
valueFloat, _ = strconv.ParseFloat(removeComma(value), 64)
parameters[col] = fmt.Sprintf("%.16f", valueFloat)
}
} else {
parameters[col] = quote + value + quote
if value == "" && hasNullCoalescence {
parameters[col] = "shenweiNULL"
} else {
parameters[col] = quote + value + quote
}
}
}
} else {
Expand All @@ -335,26 +352,54 @@ Custom functions:
}

if reDigitals.MatchString(value) {
valueFloat, _ = strconv.ParseFloat(removeComma(value), 64)
parameters[col] = fmt.Sprintf("%.16f", valueFloat)
if digitsAsString || containCustomFuncs {
parameters[col] = quote + value + quote
} else {
valueFloat, _ = strconv.ParseFloat(removeComma(value), 64)
parameters[col] = fmt.Sprintf("%.16f", valueFloat)
}
} else {
parameters[col] = quote + value + quote
if value == "" && hasNullCoalescence {
parameters[col] = "shenweiNULL"
} else {
parameters[col] = quote + value + quote
}
}
}
}

// sort variable names by length (longest first), so that variables are replaced in the right order.
// e.g., for -e '$reads_mapped/$reads', $reads_mapped must be replaced before $reads,
// otherwise replacing $reads first would also rewrite the prefix of $reads_mapped.
keys = keys[:0]
for col = range parameters {
keys = append(keys, col)
}
sort.Slice(keys, func(i, j int) bool {
return len(keys[i]) > len(keys[j])
})

// replace variable with column data
filterStr1 = filterStr
for col, value = range parameters {
filterStr1 = strings.ReplaceAll(filterStr1, col, value)
for _, col = range keys {
filterStr1 = strings.ReplaceAll(filterStr1, col, parameters[col])
}

// evaluate
if containCustomFuncs {
expression, err = govaluate.NewEvaluableExpressionWithFunctions(filterStr1, functions)
} else {
expression, err = govaluate.NewEvaluableExpression(filterStr1)
}
checkError(err)

result, err = expression.Evaluate(emptyParams)
// check result
flag = false

if hasNullCoalescence {
result, err = expression.Evaluate(parameters2)
} else {
result, err = expression.Evaluate(emptyParams)
}
if err != nil {
flag = false
log.Warningf("row %d: %s", N, err)
Expand Down Expand Up @@ -393,12 +438,14 @@ func init() {
RootCmd.AddCommand(filter2Cmd)
filter2Cmd.Flags().StringP("filter", "f", "", `awk-like filter condition. e.g. '$age>12' or '$1 > $3' or '$name=="abc"' or '$1 % 2 == 0'`)
filter2Cmd.Flags().BoolP("line-number", "n", false, `print line number as the first column ("n")`)
filter2Cmd.Flags().BoolP("numeric-as-string", "s", false, `treat even numeric fields as strings to avoid converting big numbers into scientific notation`)
}

var reFilter2 = regexp.MustCompile(`\$([^ +-/*&\|^%><!~=()]+)`)
var reFilter2 = regexp.MustCompile(`\$([^ +-/*&\|^%><!~=()"']+)`)
var reFilter2VarField = regexp.MustCompile(`\$(\d+)`)
var reFilter2VarSymbol = regexp.MustCompile(`\$`)

// var reFilter2VarSymbol = regexp.MustCompile(`\$`)

// special colname starting with digits, e.g., 123abc
var reFiler2VarSymbolStartsWithDigits = regexp.MustCompile(`\$(\d+)([^\d +-/*&\|^%><!~=()]+)`) // for preprocess expression
var reFiler2ColSymbolStartsWithDigits = regexp.MustCompile(`^(\d+)([^\d +-/*&\|^%><!~=()]+)`) // for preparing paramters
var reFiler2VarSymbolStartsWithDigits = regexp.MustCompile(`\$(\d+)([^\d +-/*&\|^%><!~=()"']+)`) // for preprocess expression
var reFiler2ColSymbolStartsWithDigits = regexp.MustCompile(`^(\d+)([^\d +-/*&\|^%><!~=()"']+)`) // for preparing paramters
2 changes: 1 addition & 1 deletion csvtk/cmd/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,7 @@ func parseFields(cmd *cobra.Command,
}

func fuzzyField2Regexp(field string) *regexp.Regexp {
if strings.IndexAny(field, "*") >= 0 {
if strings.ContainsAny(field, "*") {
field = strings.Replace(field, "*", ".*?", -1)
}

Expand Down
4 changes: 1 addition & 3 deletions csvtk/cmd/inter.go
Original file line number Diff line number Diff line change
Expand Up @@ -226,9 +226,7 @@ Attention:
items[i] = record[f-1]
}
itemsCopy := make([]string, len(items))
for i, item := range items {
itemsCopy[i] = item
}
copy(itemsCopy, items)
valuesMaps[key] = itemsCopy
continue
}
Expand Down
6 changes: 3 additions & 3 deletions csvtk/cmd/line.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ Notes:
if dataFieldXStr == "" {
checkError(fmt.Errorf("flag -x (--data-field-x) needed"))
}
if strings.Index(dataFieldXStr, ",") >= 0 {
if strings.Contains(dataFieldXStr, ",") {
checkError(fmt.Errorf("only one field allowed for flag -x (--data-field-x)"))
}
if dataFieldXStr[0] == '-' {
Expand All @@ -85,7 +85,7 @@ Notes:
if dataFieldYStr == "" {
checkError(fmt.Errorf("flag -y (--data-field-y) needed"))
}
if strings.Index(dataFieldYStr, ",") >= 0 {
if strings.Contains(dataFieldYStr, ",") {
checkError(fmt.Errorf("only one field allowed for flag -y (--data-field-y)"))
}
if dataFieldXStr[0] == '-' {
Expand All @@ -94,7 +94,7 @@ Notes:

groupFieldStr := getFlagString(cmd, "group-field")
if len(groupFieldStr) > 0 {
if strings.Index(groupFieldStr, ",") >= 0 {
if strings.Contains(groupFieldStr, ",") {
checkError(fmt.Errorf("only one field allowed for flag --group-field"))
}
if groupFieldStr[0] == '-' {
Expand Down
37 changes: 22 additions & 15 deletions csvtk/cmd/mutate.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ var mutateCmd = &cobra.Command{
checkFields := true

var record2 []string // for output
var ok bool

printMetaLine := true
for chunk := range csvReader.Ch {
Expand Down Expand Up @@ -228,26 +229,32 @@ var mutateCmd = &cobra.Command{
} else {
record2 = record
}

if handleHeaderRow {
record2 = append(record2, name)
handleHeaderRow = false
checkError(writer.Write(record2))
continue
}

for f := range record {
// record2[f] = record[f]
if _, ok := fieldsMap[f+1]; ok {
if handleHeaderRow {
record2 = append(record2, name)
handleHeaderRow = false
_, ok = fieldsMap[f+1]
if !ok {
continue
}

if patternRegexp.MatchString(record[f]) {
found := patternRegexp.FindAllStringSubmatch(record[f], -1)
record2 = append(record2, found[0][1])
} else {
if naUnmatched {
record2 = append(record2, "")
} else {
if patternRegexp.MatchString(record[f]) {
found := patternRegexp.FindAllStringSubmatch(record[f], -1)
record2 = append(record2, found[0][1])
} else {
if naUnmatched {
record2 = append(record2, "")
} else {
record2 = append(record2, record[f])
}
}
record2 = append(record2, record[f])
}
break
}
break
}
checkError(writer.Write(record2))
}
Expand Down
Loading

0 comments on commit 3e3dbf7

Please sign in to comment.