From b09dacb98ee30617fe16440d8c5e153d8e5e8b71 Mon Sep 17 00:00:00 2001 From: Wei Shen Date: Thu, 29 Jun 2023 23:21:58 +0800 Subject: [PATCH] new command: csvtk fix. #226 --- CHANGELOG.md | 3 +- README.md | 7 +- csvtk/cmd/fix.go | 185 +++++++++++++++++++++++++++++++++++++ doc/docs/usage.md | 63 ++++++++++++- testdata/unequal_ncols.csv | 6 ++ 5 files changed, 259 insertions(+), 5 deletions(-) create mode 100644 csvtk/cmd/fix.go create mode 100644 testdata/unequal_ncols.csv diff --git a/CHANGELOG.md b/CHANGELOG.md index 24a3e0c..9039615 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,9 @@ - [csvtk v0.26.0](https://github.com/shenwei356/csvtk/releases/tag/v0.26.0) [![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/csvtk/v0.26.0/total.svg)](https://github.com/shenwei356/csvtk/releases/tag/v0.26.0) - `csvtk`: - - **near all commands skip empty file now**. [#204](https://github.com/shenwei356/csvtk/issues/204) + - **near all commands skip empty files now**. [#204](https://github.com/shenwei356/csvtk/issues/204) - the global flag `--infile-list` accepts stdin "-". [#210](https://github.com/shenwei356/csvtk/issues/210) + - new command `csvtk fix`: fix CSV/TSV with different numbers of columns in rows. [#226](https://github.com/shenwei356/csvtk/issues/226) - `csvtk pretty`: **rewrite to support wrapping cells**. [#206](https://github.com/shenwei356/csvtk/issues/206) [#209](https://github.com/shenwei356/csvtk/issues/209) [#228](https://github.com/shenwei356/csvtk/issues/228) - `csvtk cut/fmtdate/freq/grep/rename/rename2/replace/round`: allow duplicated column names. - `csvtk csv2xlsx`: optionally stores numbers as float. [#217](https://github.com/shenwei356/csvtk/issues/217) diff --git a/README.md b/README.md index 9e777e0..45b663c 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ It could save you lots of time in (not) writing Python/R scripts. ## Subcommands -49 subcommands in total. +50 subcommands in total. 
**Information** @@ -84,9 +84,9 @@ It could save you lots of time in (not) writing Python/R scripts. - [`space2tab`](https://bioinf.shenwei.me/csvtk/usage/#space2tab): converts space delimited format to TSVW - [`transpose`](https://bioinf.shenwei.me/csvtk/usage/#transpose): transposes CSV data - [`csv2md`](https://bioinf.shenwei.me/csvtk/usage/#csv2md): converts CSV to markdown format -- [`csv2rst`](https://bioinf.shenwei.me/csvtk/usage/#csv2rst): convert CSV to reStructuredText format +- [`csv2rst`](https://bioinf.shenwei.me/csvtk/usage/#csv2rst): converts CSV to reStructuredText format - [`csv2json`](https://bioinf.shenwei.me/csvtk/usage/#csv2json): converts CSV to JSON format -- [`csv2xlsx`](https://bioinf.shenwei.me/csvtk/usage/#csv2xlsx): convert CSV/TSV files to XLSX file +- [`csv2xlsx`](https://bioinf.shenwei.me/csvtk/usage/#csv2xlsx): converts CSV/TSV files to XLSX file - [`xlsx2csv`](https://bioinf.shenwei.me/csvtk/usage/#xlsx2csv): converts XLSX to CSV format **Set operations** @@ -108,6 +108,7 @@ It could save you lots of time in (not) writing Python/R scripts. 
**Edit** +- [`fix`](https://bioinf.shenwei.me/csvtk/usage/#fix): fix CSV/TSV with different numbers of columns in rows - [`add-header`](https://bioinf.shenwei.me/csvtk/usage/#add-header): add column names - [`del-header`](https://bioinf.shenwei.me/csvtk/usage/#del-header): delete column names - [`rename`](https://bioinf.shenwei.me/csvtk/usage/#rename): renames column names with new names diff --git a/csvtk/cmd/fix.go b/csvtk/cmd/fix.go new file mode 100644 index 0000000..3915732 --- /dev/null +++ b/csvtk/cmd/fix.go @@ -0,0 +1,185 @@ +// Copyright © 2016-2021 Wei Shen +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +package cmd + +import ( + "encoding/csv" + "fmt" + "runtime" + + "github.com/shenwei356/xopen" + "github.com/spf13/cobra" +) + +// fixCmd represents the fix command +var fixCmd = &cobra.Command{ + Use: "fix", + Short: "fix CSV/TSV with different numbers of columns in rows", + Long: `fix CSV/TSV with different numbers of columns in rows + +How to: + 1. First -n/--buf-rows rows are read to check the maximum number of columns. + The default value 0 means all rows will be read. + 2. Buffered and remaining rows with fewer columns are appended with empty + cells before output. + 3. An error will be reported if the number of columns of any remaining row + is larger than the maximum number of columns. + +`, + Run: func(cmd *cobra.Command, args []string) { + config := getConfigs(cmd) + files := getFileListFromArgsAndFile(cmd, args, true, "infile-list", true) + if len(files) > 1 { + checkError(fmt.Errorf("no more than one file should be given")) + } + runtime.GOMAXPROCS(config.NumCPUs) + + bufRows := getFlagNonNegativeInt(cmd, "buf-rows") + + var buf [][]string + var readAll bool + if bufRows > 0 { + buf = make([][]string, 0, bufRows) + } else { + readAll = true + buf = make([][]string, 0, 1024) + } + + outfh, err := xopen.Wopen(config.OutFile) + checkError(err) + defer outfh.Close() + + writer := csv.NewWriter(outfh) + if config.OutTabs || config.Tabs { + if config.OutDelimiter == ',' { // default value, no other value given + writer.Comma = '\t' + } else { + writer.Comma = config.OutDelimiter + } + } else { + writer.Comma = config.OutDelimiter + } + + file := files[0] + + csvReader, err := newCSVReaderByConfig(config, file) + + if err != nil { + if err == xopen.ErrNoContent { + log.Warningf("csvtk fix: skipping empty input file: %s", file) + return + } + checkError(err) + } + + // very important. + // If FieldsPerRecord is negative, no check is made and + // records may have a variable number of fields. 
+ csvReader.Reader.FieldsPerRecord = -1 + + csvReader.Run() + + var record []string + var n int // number of loaded rows + var maxN int + var checkedMaxNcols bool + var row []string + var ncol int + var empty []string + for chunk := range csvReader.Ch { + checkError(chunk.Err) + + for _, record = range chunk.Data { + n++ + + if readAll { + buf = append(buf, record) + continue + } + + if !checkedMaxNcols { + buf = append(buf, record) // buffer only until maxN is known; later rows are streamed + if n == bufRows { + maxN = maxNcols(buf) + log.Infof("the maximum number of columns in first %d rows: %d", bufRows, maxN) + checkedMaxNcols = true + empty = make([]string, maxN) + + for _, row = range buf { + ncol = len(row) + if ncol < maxN { + row = append(row, empty[0:maxN-ncol]...) + } + writer.Write(row) + } + } + + continue + } + + ncol = len(record) + if ncol > maxN { + checkError(fmt.Errorf("line %d: the number of columns is larger than %d, please increase the value of -n/--buf-rows (%d)", n, maxN, bufRows)) + } else if ncol < maxN { + record = append(record, empty[0:maxN-ncol]...) + } + writer.Write(record) + } + } + + if readAll || !checkedMaxNcols { + maxN = maxNcols(buf) + empty = make([]string, maxN) + log.Infof("the maximum number of columns in all %d rows: %d", len(buf), maxN) + + for _, row = range buf { + ncol = len(row) + if ncol < maxN { + row = append(row, empty[0:maxN-ncol]...) + } + writer.Write(row) + } + } + + writer.Flush() + checkError(writer.Error()) + + readerReport(&config, csvReader, file) + }, +} + +func maxNcols(buf [][]string) int { + maxN := 0 + var ncol int + for _, row := range buf { + ncol = len(row) + if ncol > maxN { + maxN = ncol + } + } + return maxN +} + +func init() { + RootCmd.AddCommand(fixCmd) + + fixCmd.Flags().IntP("buf-rows", "n", 0, "the number of rows to determine the maximum number of columns. 
0 for all rows.") +} diff --git a/doc/docs/usage.md b/doc/docs/usage.md index d23313e..8874d9e 100644 --- a/doc/docs/usage.md +++ b/doc/docs/usage.md @@ -59,6 +59,7 @@ **Edit** +- [fix](#fix) - [add-header](#add-header) - [del-header](#del-header) - [rename](#rename) @@ -2494,7 +2495,67 @@ b,c,d a,b,c,d ``` - + +## fix + +Usage + +```text +fix CSV/TSV with different numbers of columns in rows + +How to: + 1. First -n/--buf-rows rows are read to check the maximum number of columns. + The default value 0 means all rows will be read. + 2. Buffered and remaining rows with fewer columns are appended with empty + cells before output. + 3. An error will be reported if the number of columns of any remaining row + is larger than the maximum number of columns. + +Usage: + csvtk fix [flags] + +Flags: + -n, --buf-rows int the number of rows to determine the maximum number of columns. 0 for all rows. + -h, --help help for fix + + +``` + +Examples + +``` +$ cat testdata/unequal_ncols.csv +id,first_name,last_name +11,"Rob","Pike" +2,Ken,Thompson +4,"Robert","Griesemer","gri" +1,"Robert","Thompson","abc" +NA,"Robert" + + +$ cat testdata/unequal_ncols.csv | csvtk pretty +[ERRO] record on line 4: wrong number of fields + + + +$ cat testdata/unequal_ncols.csv | csvtk fix | csvtk pretty -S grid +[INFO] the maximum number of columns in all 6 rows: 4 ++----+------------+-----------+-----+ +| id | first_name | last_name | | ++====+============+===========+=====+ +| 11 | Rob | Pike | | ++----+------------+-----------+-----+ +| 2 | Ken | Thompson | | ++----+------------+-----------+-----+ +| 4 | Robert | Griesemer | gri | ++----+------------+-----------+-----+ +| 1 | Robert | Thompson | abc | ++----+------------+-----------+-----+ +| NA | Robert | | | ++----+------------+-----------+-----+ + +``` + ## add-header Usage diff --git a/testdata/unequal_ncols.csv b/testdata/unequal_ncols.csv new file mode 100644 index 0000000..0e488ba --- /dev/null +++ b/testdata/unequal_ncols.csv @@ -0,0 +1,6 
@@ +id,first_name,last_name +11,"Rob","Pike" +2,Ken,Thompson +4,"Robert","Griesemer","gri" +1,"Robert","Thompson","abc" +NA,"Robert"