From b09dacb98ee30617fe16440d8c5e153d8e5e8b71 Mon Sep 17 00:00:00 2001
From: Wei Shen <shenwei356@gmail.com>
Date: Thu, 29 Jun 2023 23:21:58 +0800
Subject: [PATCH] new command: csvtk fix. #226

---
 CHANGELOG.md               |   3 +-
 README.md                  |   7 +-
 csvtk/cmd/fix.go           | 185 +++++++++++++++++++++++++++++++++++++
 doc/docs/usage.md          |  63 ++++++++++++-
 testdata/unequal_ncols.csv |   6 ++
 5 files changed, 259 insertions(+), 5 deletions(-)
 create mode 100644 csvtk/cmd/fix.go
 create mode 100644 testdata/unequal_ncols.csv

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 24a3e0c..9039615 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,9 @@
 - [csvtk v0.26.0](https://github.com/shenwei356/csvtk/releases/tag/v0.26.0)
 [![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/csvtk/v0.26.0/total.svg)](https://github.com/shenwei356/csvtk/releases/tag/v0.26.0)
   - `csvtk`: 
-      - **near all commands skip empty file now**. [#204](https://github.com/shenwei356/csvtk/issues/204)
+      - **nearly all commands skip empty files now**. [#204](https://github.com/shenwei356/csvtk/issues/204)
       - the global flag `--infile-list` accepts stdin "-". [#210](https://github.com/shenwei356/csvtk/issues/210)
+  - new command `csvtk fix`: fix CSV/TSV with different numbers of columns in rows. [#226](https://github.com/shenwei356/csvtk/issues/226)
   - `csvtk pretty`: **rewrite to support wrapping cells**. [#206](https://github.com/shenwei356/csvtk/issues/206) [#209](https://github.com/shenwei356/csvtk/issues/209)  [#228](https://github.com/shenwei356/csvtk/issues/228)
   - `csvtk cut/fmtdate/freq/grep/rename/rename2/replace/round`: allow duplicated column names.
   - `csvtk csv2xlsx`: optionally stores numbers as float. [#217](https://github.com/shenwei356/csvtk/issues/217)
diff --git a/README.md b/README.md
index 9e777e0..45b663c 100644
--- a/README.md
+++ b/README.md
@@ -64,7 +64,7 @@ It could save you lots of time in (not) writing Python/R scripts.
 
 ## Subcommands
 
-49 subcommands in total.
+50 subcommands in total.
 
 **Information**
 
@@ -84,9 +84,9 @@ It could save you lots of time in (not) writing Python/R scripts.
 - [`space2tab`](https://bioinf.shenwei.me/csvtk/usage/#space2tab): converts space delimited format to TSVW
 - [`transpose`](https://bioinf.shenwei.me/csvtk/usage/#transpose): transposes CSV data
 - [`csv2md`](https://bioinf.shenwei.me/csvtk/usage/#csv2md): converts CSV to markdown format
-- [`csv2rst`](https://bioinf.shenwei.me/csvtk/usage/#csv2rst): convert CSV to reStructuredText format
+- [`csv2rst`](https://bioinf.shenwei.me/csvtk/usage/#csv2rst): converts CSV to reStructuredText format
 - [`csv2json`](https://bioinf.shenwei.me/csvtk/usage/#csv2json): converts CSV to JSON format
-- [`csv2xlsx`](https://bioinf.shenwei.me/csvtk/usage/#csv2xlsx): convert CSV/TSV files to XLSX file
+- [`csv2xlsx`](https://bioinf.shenwei.me/csvtk/usage/#csv2xlsx): converts CSV/TSV files to XLSX file
 - [`xlsx2csv`](https://bioinf.shenwei.me/csvtk/usage/#xlsx2csv): converts XLSX to CSV format
 
 **Set operations**
@@ -108,6 +108,7 @@ It could save you lots of time in (not) writing Python/R scripts.
 
 **Edit**
 
+- [`fix`](https://bioinf.shenwei.me/csvtk/usage/#fix): fix CSV/TSV with different numbers of columns in rows
 - [`add-header`](https://bioinf.shenwei.me/csvtk/usage/#add-header): add column names
 - [`del-header`](https://bioinf.shenwei.me/csvtk/usage/#del-header): delete column names
 - [`rename`](https://bioinf.shenwei.me/csvtk/usage/#rename): renames column names with new names
diff --git a/csvtk/cmd/fix.go b/csvtk/cmd/fix.go
new file mode 100644
index 0000000..3915732
--- /dev/null
+++ b/csvtk/cmd/fix.go
@@ -0,0 +1,185 @@
+// Copyright © 2016-2021 Wei Shen <shenwei356@gmail.com>
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+package cmd
+
+import (
+	"encoding/csv"
+	"fmt"
+	"runtime"
+
+	"github.com/shenwei356/xopen"
+	"github.com/spf13/cobra"
+)
+
+// fixCmd represents the fix command. It makes all rows of a CSV/TSV file have
+// the same number of columns by appending empty cells to the shorter rows.
+//
+// The target number of columns is the maximum found in the first
+// -n/--buf-rows rows, or in all rows when the flag is 0. Only these rows are
+// held in memory; the rows after them are written as they are read, and a
+// row with more columns than the target is reported as an error.
+var fixCmd = &cobra.Command{
+	Use:   "fix",
+	Short: "fix CSV/TSV with different numbers of columns in rows",
+	Long: `fix CSV/TSV with different numbers of columns in rows
+
+How to:
+  1. First -n/--buf-rows rows are read to check the maximum number of columns.
+     The default value 0 means all rows will be read.
+  2. Buffered and remaining rows with fewer columns are appended with empty
+     cells before output.
+  3. An error will be reported if the number of columns of any remaining row
+     is larger than the maximum number of columns.
+
+`,
+	Run: func(cmd *cobra.Command, args []string) {
+		config := getConfigs(cmd)
+		files := getFileListFromArgsAndFile(cmd, args, true, "infile-list", true)
+		if len(files) > 1 {
+			checkError(fmt.Errorf("no more than one file should be given"))
+		}
+		runtime.GOMAXPROCS(config.NumCPUs)
+
+		// 0, the default, means that all rows are buffered before any output.
+		bufRows := getFlagNonNegativeInt(cmd, "buf-rows")
+		readAll := bufRows == 0
+
+		bufCap := bufRows
+		if readAll {
+			bufCap = 1024
+		}
+		buf := make([][]string, 0, bufCap)
+
+		outfh, err := xopen.Wopen(config.OutFile)
+		checkError(err)
+		defer outfh.Close()
+
+		writer := csv.NewWriter(outfh)
+		writer.Comma = config.OutDelimiter
+		if (config.OutTabs || config.Tabs) && config.OutDelimiter == ',' {
+			// tabular output with the default delimiter, no other value given
+			writer.Comma = '\t'
+		}
+
+		// NOTE(review): assumes the file list is never empty — confirm.
+		file := files[0]
+
+		csvReader, err := newCSVReaderByConfig(config, file)
+		if err != nil {
+			if err == xopen.ErrNoContent {
+				// there is nothing to fix in an empty input
+				log.Warningf("csvtk fix: skipping empty input file: %s", file)
+				return
+			}
+			checkError(err)
+		}
+
+		// very important.
+		// If FieldsPerRecord is negative, no check is made and
+		// records may have a variable number of fields.
+		csvReader.Reader.FieldsPerRecord = -1
+
+		csvReader.Run()
+
+		var (
+			n               int      // number of loaded rows
+			maxN            int      // number of columns of every output row
+			checkedMaxNcols bool     // whether maxN has been determined
+			empty           []string // maxN empty cells, the source of padding
+		)
+
+		// writeRow pads row with empty cells up to maxN columns and writes it.
+		writeRow := func(row []string) {
+			if ncol := len(row); ncol < maxN {
+				row = append(row, empty[:maxN-ncol]...)
+			}
+			// stop at the first write error rather than reading on
+			checkError(writer.Write(row))
+		}
+
+		// flushBuf derives maxN from the buffered rows, writes them out and
+		// releases the buffer. scope ("first" or "all") only affects the log.
+		flushBuf := func(scope string) {
+			maxN = maxNcols(buf)
+			empty = make([]string, maxN)
+			checkedMaxNcols = true
+			log.Infof("the maximum number of columns in %s %d rows: %d", scope, len(buf), maxN)
+			for _, row := range buf {
+				writeRow(row)
+			}
+			// the buffered rows are written and no longer needed
+			buf = nil
+		}
+
+		// Rows are buffered until maxN is known, the rest are streamed.
+		for chunk := range csvReader.Ch {
+			checkError(chunk.Err)
+
+			for _, record := range chunk.Data {
+				n++
+
+				if checkedMaxNcols {
+					// maxN is fixed now, so the row is checked and written
+					// directly instead of being kept in memory.
+					if len(record) > maxN {
+						checkError(fmt.Errorf("line %d: the number of columns is larger than %d, please increase the value of -n/--buf-rows (%d)", n, maxN, bufRows))
+					}
+					writeRow(record)
+					continue
+				}
+
+				buf = append(buf, record)
+				if !readAll && n == bufRows {
+					flushBuf("first")
+				}
+			}
+		}
+
+		// -n 0 was given, or the input has fewer rows than -n/--buf-rows.
+		if !checkedMaxNcols {
+			flushBuf("all")
+		}
+
+		// csv.Writer is buffered: flush it and report any pending write error.
+		writer.Flush()
+		checkError(writer.Error())
+
+		readerReport(&config, csvReader, file)
+	},
+}
+
+// maxNcols returns the number of cells of the longest row in buf.
+// It returns 0 when buf has no rows or only empty rows.
+func maxNcols(buf [][]string) int {
+	widest := 0
+	for i := range buf {
+		if w := len(buf[i]); w > widest {
+			widest = w
+		}
+	}
+	return widest
+}
+
+func init() {
+	// declare the -n/--buf-rows flag, then attach the command to the root command
+	fixCmd.Flags().IntP("buf-rows", "n", 0, "the number of rows to determine the maximum number of columns. 0 for all rows.")
+	RootCmd.AddCommand(fixCmd)
+}
diff --git a/doc/docs/usage.md b/doc/docs/usage.md
index d23313e..8874d9e 100644
--- a/doc/docs/usage.md
+++ b/doc/docs/usage.md
@@ -59,6 +59,7 @@
 
 **Edit**
 
+- [fix](#fix)
 - [add-header](#add-header)
 - [del-header](#del-header)
 - [rename](#rename)
@@ -2494,7 +2495,67 @@ b,c,d
 a,b,c,d
 
 ```
-        
+
+## fix
+
+Usage
+
+```text
+fix CSV/TSV with different numbers of columns in rows
+
+How to:
+  1. First -n/--buf-rows rows are read to check the maximum number of columns.
+     The default value 0 means all rows will be read.
+  2. Buffered and remaining rows with fewer columns are appended with empty
+     cells before output.
+  3. An error will be reported if the number of columns of any remaining row
+     is larger than the maximum number of columns.
+
+Usage:
+  csvtk fix [flags]
+
+Flags:
+  -n, --buf-rows int   the number of rows to determine the maximum number of columns. 0 for all rows.
+  -h, --help           help for fix
+
+
+```
+
+Examples
+
+```
+$ cat testdata/unequal_ncols.csv
+id,first_name,last_name
+11,"Rob","Pike"
+2,Ken,Thompson
+4,"Robert","Griesemer","gri"
+1,"Robert","Thompson","abc"
+NA,"Robert"
+
+
+$ cat testdata/unequal_ncols.csv | csvtk pretty
+[ERRO] record on line 4: wrong number of fields
+
+
+
+$ cat testdata/unequal_ncols.csv | csvtk fix | csvtk pretty -S grid
+[INFO] the maximum number of columns in all 6 rows: 4
++----+------------+-----------+-----+
+| id | first_name | last_name |     |
++====+============+===========+=====+
+| 11 | Rob        | Pike      |     |
++----+------------+-----------+-----+
+| 2  | Ken        | Thompson  |     |
++----+------------+-----------+-----+
+| 4  | Robert     | Griesemer | gri |
++----+------------+-----------+-----+
+| 1  | Robert     | Thompson  | abc |
++----+------------+-----------+-----+
+| NA | Robert     |           |     |
++----+------------+-----------+-----+
+
+```
+
 ## add-header
 
 Usage
diff --git a/testdata/unequal_ncols.csv b/testdata/unequal_ncols.csv
new file mode 100644
index 0000000..0e488ba
--- /dev/null
+++ b/testdata/unequal_ncols.csv
@@ -0,0 +1,6 @@
+id,first_name,last_name
+11,"Rob","Pike"
+2,Ken,Thompson
+4,"Robert","Griesemer","gri"
+1,"Robert","Thompson","abc"
+NA,"Robert"