diff --git a/io/transforms/gsheet2csv/README.md b/io/transforms/gsheet2csv/README.md index 1796636..5a3884e 100644 --- a/io/transforms/gsheet2csv/README.md +++ b/io/transforms/gsheet2csv/README.md @@ -15,6 +15,8 @@ This does surprisingly little - you should probably just handle the boilerplate - can preserve comments - swaps `\r` (Windows) for `\n` (Unix) and ensures trailing newline (a la `encoding/csv`) +Note: The Google Sheet must be shared to **Anyone with the link**. + ## Usage Same as `encoding/csv` (embedded), but with two extra options: @@ -37,16 +39,15 @@ func main() { fmt.Fprintf(os.Stderr, "Usage: %s \n", os.Args[0]) os.Exit(1) } - url := os.Args[1] + urlOrPath := os.Args[1] - gsr := gsheet2csv.NewReaderFromURL(url) + gsr := gsheet2csv.NewReaderFrom(urlOrPath) records, err := gsr.ReadAll() if err != nil { - fmt.Fprintf(os.Stderr, "Error reading from %s\n", gsr.URL) + fmt.Fprintf(os.Stderr, "Error reading from %s: %v\n", gsr.URL, err) os.Exit(1) } - // distinguishes between comments and quoted fields csvw := gsheet2csv.NewWriter(os.Stdout) csvw.Comment = gsr.Comment if err := csvw.WriteAll(records); err != nil { diff --git a/io/transforms/gsheet2csv/cmd/gsheet2csv/main.go b/io/transforms/gsheet2csv/cmd/gsheet2csv/main.go index be51af9..b0f2d12 100644 --- a/io/transforms/gsheet2csv/cmd/gsheet2csv/main.go +++ b/io/transforms/gsheet2csv/cmd/gsheet2csv/main.go @@ -1,6 +1,7 @@ package main import ( + "encoding/csv" "errors" "flag" "fmt" @@ -19,8 +20,21 @@ const ( unitSeparator = "\x1f" ) +type CSVReader interface { + Read() ([]string, error) + ReadAll() ([][]string, error) +} + +type CSVWriter interface { + Write([]string) error + WriteAll([][]string) error + Flush() + Error() error +} + func main() { var commentArg string + newline := "\n" format := "CSV" delim := ',' if strings.Contains(os.Args[0], "tsv") { @@ -29,13 +43,17 @@ func main() { } // Parse command-line flags - flag.StringVar(&commentArg, "comment", "#", "treat lines beginning with this rune as comments") + flag.StringVar(&commentArg, "comment", "#", "treat lines beginning with this rune as comments, 0 to disable") outputFile := flag.String("o", "", "Output "+format+" file (default: stdout)") delimString := flag.String("d", string(delim), "field delimiter to use for output file ('\\t' for tab, '^_' for Unit Separator, etc)") useCRLF := flag.Bool("crlf", false, "use CRLF (\\r\\n) as record separator") urlOnly := flag.Bool("print-url", false, "don't download, just print the Google Sheet URL") parseOnly := flag.Bool("print-ids", false, "don't download, just print the Doc ID and Sheet ID (gid)") rawOnly := flag.Bool("raw", false, "don't parse, just download") + noReadComments := flag.Bool("no-read-comments", false, "strip comments when reading (gsheet-only)") + noWriteComments := flag.Bool("no-write-comments", false, "strip comments when writing") + readStyle := flag.String("read-style", "gsheet", "'gsheet' or 'rfc' to read either as a gsheet or rfc CSV") + writeStyle := flag.String("write-style", "rfc", "'gsheet' or 'rfc' to write either as a gsheet or rfc CSV") flag.Usage = func() { fmt.Fprintf(os.Stderr, "Usage: %s [flags] \n", os.Args[0]) fmt.Fprintf(os.Stderr, "Converts a Google Sheet to %s format.\n\n", format) @@ -88,6 +106,10 @@ func main() { } delim, _ = utf8.DecodeRuneInString(*delimString) + if *useCRLF { + newline = "\r\n" + } + var rc io.ReadCloser if strings.HasPrefix(url, "https://") || strings.HasPrefix(url, "http://") { docid, gid := gsheet2csv.ParseIDs(url) @@ -123,7 +145,10 @@ func main() { } rc = f } - fmt.Fprintf(os.Stderr, "\n") + + if out == os.Stdout { + fmt.Fprintf(os.Stderr, "\n") + } if *urlOnly || *parseOnly { os.Exit(0) @@ -138,22 +163,53 @@ func main() { return } - comment, _ := utf8.DecodeRuneInString(commentArg) + var comment rune + if commentArg == "0" { + comment = 0 + } else { + comment, _ = utf8.DecodeRuneInString(commentArg) + } // Create a reader for the Google Sheet - gsr := gsheet2csv.NewReader(rc) - gsr.QuotedComments = false - gsr.Comment = 0 - gsr.ReuseRecord = true + var csvr CSVReader + if *readStyle == "rfc" { + rfcr := csv.NewReader(rc) + rfcr.Comment = comment + csvr = rfcr + } else { + gsr := gsheet2csv.NewReader(rc) + gsr.QuotedComments = false + if *noReadComments { + gsr.Comment = '#' + } else { + gsr.Comment = 0 + } + gsr.ReuseRecord = true + csvr = gsr + } // Create CSV writer - csvw := gsheet2csv.NewWriter(out) - csvw.Comma = delim // Set delimiter to tab for TSV - csvw.Comment = comment - csvw.UseCRLF = *useCRLF + var csvw CSVWriter + if *writeStyle == "gsheet" { + gsw := gsheet2csv.NewWriter(out) + gsw.Comment = comment + gsw.Comma = delim // Set delimiter to tab for TSV + gsw.UseCRLF = *useCRLF + csvw = gsw + } else { + rfcw := csv.NewWriter(out) + rfcw.Comma = delim + rfcw.UseCRLF = *useCRLF + csvw = rfcw + } + + const quote = `"` + const dquote = quote + quote + var commaStr = string(delim) + var commentStr = string(comment) for { // Convert each record - record, err := gsr.Read() + record, err := csvr.Read() if err != nil { if errors.Is(err, io.EOF) { break @@ -162,7 +218,39 @@ func main() { os.Exit(1) } - if err := csvw.Write(record); err != nil { + lastNonEmpty := len(record) - 1 + for lastNonEmpty > -1 && len(record[lastNonEmpty]) == 0 { + lastNonEmpty -= 1 + } + + if *writeStyle != "rfc" || comment == 0 || len(record) == 0 || !strings.HasPrefix(record[0], commentStr) { + if *noWriteComments && len(record) > 0 && strings.HasPrefix(record[0], commentStr) && lastNonEmpty == 0 { + continue + } + + if err := csvw.Write(record); err != nil { + fmt.Fprintf(os.Stderr, "Error writing "+format+": %v\n", err) + os.Exit(1) + } + continue + } + + if lastNonEmpty == 0 { + record = record[:1] + if *noWriteComments { + continue + } + } else { + for i, f := range record { + if i == 0 || strings.Contains(f, quote) || strings.Contains(f, string(commaStr)) { + f = strings.ReplaceAll(f, quote, dquote) + record[i] = quote + f + quote + } + } + } + + csvw.Flush() + if _, err := out.Write([]byte(strings.Join(record, commaStr) + newline)); err != nil { fmt.Fprintf(os.Stderr, "Error writing "+format+": %v\n", err) os.Exit(1) } @@ -173,4 +261,8 @@ func main() { fmt.Fprintf(os.Stderr, "Error flushing "+format+" writer: %v\n", err) os.Exit(1) } + + if out != os.Stdout { + fmt.Fprintf(os.Stderr, "wrote %s\n", *outputFile) + } } diff --git a/io/transforms/gsheet2csv/cmd/gsheet2tsv/main.go b/io/transforms/gsheet2csv/cmd/gsheet2tsv/main.go index be51af9..55cdfbe 100644 --- a/io/transforms/gsheet2csv/cmd/gsheet2tsv/main.go +++ b/io/transforms/gsheet2csv/cmd/gsheet2tsv/main.go @@ -123,7 +123,10 @@ func main() { } rc = f } - fmt.Fprintf(os.Stderr, "\n") + + if out == os.Stdout { + fmt.Fprintf(os.Stderr, "\n") + } if *urlOnly || *parseOnly { os.Exit(0) @@ -173,4 +176,8 @@ func main() { fmt.Fprintf(os.Stderr, "Error flushing "+format+" writer: %v\n", err) os.Exit(1) } + + if out != os.Stdout { + fmt.Fprintf(os.Stderr, "wrote %s\n", *outputFile) + } } diff --git a/io/transforms/gsheet2csv/fixtures/example.go b/io/transforms/gsheet2csv/fixtures/example.go index 6bbc48c..ccbc2d3 100644 --- a/io/transforms/gsheet2csv/fixtures/example.go +++ b/io/transforms/gsheet2csv/fixtures/example.go @@ -15,12 +15,12 @@ func main() { fmt.Fprintf(os.Stderr, "Usage: %s \n", os.Args[0]) os.Exit(1) } - url := os.Args[1] + urlOrPath := os.Args[1] - gsr := gsheet2csv.NewReaderFromURL(url) + gsr := gsheet2csv.NewReaderFrom(urlOrPath) records, err := gsr.ReadAll() if err != nil { - fmt.Fprintf(os.Stderr, "Error reading from %s\n", gsr.URL) + fmt.Fprintf(os.Stderr, "Error reading from %s: %v\n", gsr.URL, err) os.Exit(1) } diff --git a/io/transforms/gsheet2csv/fixtures/sheet.csv b/io/transforms/gsheet2csv/fixtures/sheet.csv new file mode 100644 index 0000000..f1627e1 --- /dev/null +++ b/io/transforms/gsheet2csv/fixtures/sheet.csv @@ -0,0 +1,18 @@ +# this is a comment +"# this is, well, a quoted comment" +"# this is a ""super""-quoted comment" +Key,Value, +Name,55, +Girlfriend's Age,55, +,, +My IQ,55, +,55, +"Key,with,Comma",, +,"Value,with,Comma", +"Quoted ""Key""",Normal Value, +Normal Key,"Quoted ""Value""", +"Quoted ""Key""",, +,"Quoted ""Value""", +x,y,z +"# comment with trailing comma," +"#1",2,#3 diff --git a/io/transforms/gsheet2csv/gsheet2csv.go b/io/transforms/gsheet2csv/gsheet2csv.go index b5841bc..fb9e983 100644 --- a/io/transforms/gsheet2csv/gsheet2csv.go +++ b/io/transforms/gsheet2csv/gsheet2csv.go @@ -17,6 +17,7 @@ import ( "fmt" "io" "net/http" + "os" "strings" "unicode/utf8" ) @@ -39,6 +40,22 @@ type Reader struct { err error } +func NewReaderFrom(urlOrPath string) *Reader { + if strings.HasPrefix(urlOrPath, "https://") || strings.HasPrefix(urlOrPath, "http://") { + return NewReaderFromURL(urlOrPath) + } + + urlOrPath = strings.TrimPrefix(urlOrPath, "file://") + f, err := os.Open(urlOrPath) + r := NewReader(f) + r.URL = urlOrPath + if err != nil { + r.err = err + } + + return r +} + func NewReaderFromURL(url string) *Reader { docid, gid := ParseIDs(url) @@ -85,9 +102,9 @@ func GetSheet(docid, gid string) (*http.Response, error) { func NewReader(r io.Reader) *Reader { csvr := csv.NewReader(r) csvr.Comma = ',' - csvr.Comment = 0 // to allow distinguishing between quoted comments and fields - csvr.FieldsPerRecord = 0 // Google Sheets is consistent - csvr.LazyQuotes = false // fields that need quotes use them correctly + csvr.Comment = 0 // to allow distinguishing between quoted comments and fields + csvr.FieldsPerRecord = -1 // Google Sheets is consistent, but our commented files are not + csvr.LazyQuotes = false // fields that need quotes use them correctly csvr.TrimLeadingSpace = false csvr.ReuseRecord = false return &Reader{ @@ -186,8 +203,9 @@ func ParseIDs(urlStr string) (docid string, gid string) { type Writer struct { *csv.Writer - Comment rune - w io.Writer + Comment rune + QuoteComments bool + w io.Writer } func NewWriter(w io.Writer) *Writer { @@ -201,25 +219,31 @@ func NewWriter(w io.Writer) *Writer { func (w *Writer) Write(record []string) error { if len(record) > 1 { if rv1, _ := utf8.DecodeRuneInString(record[0]); rv1 == w.Comment { - w.Flush() - lastNonEmpty := len(record) - 1 - for len(record[lastNonEmpty]) == 0 { - lastNonEmpty -= 1 + if lastNonEmpty > -1 { + for len(record[lastNonEmpty]) == 0 { + lastNonEmpty -= 1 + } } + first := 0 if lastNonEmpty == 0 { record = record[:1] - } else { - for i, f := range record { - if i == 0 || strings.Contains(f, `"`) { - f = strings.ReplaceAll(f, `"`, `""`) - record[i] = `"` + f + `"` - } + // if !w.QuoteComments { + // return nil + // } + first = -1 + } + + for i, f := range record { + if i == first || strings.Contains(f, `"`) || strings.Contains(f, string(w.Comma)) { + f = strings.ReplaceAll(f, `"`, `""`) + record[i] = `"` + f + `"` } } line := strings.Join(record, string(w.Comma)) + w.Flush() if _, err := w.w.Write([]byte(line + "\n")); err != nil { return err } @@ -230,6 +254,19 @@ func (w *Writer) Write(record []string) error { return w.Writer.Write(record) } +func QuoteRecord(record []string, comma, comment string) { + const quote = `"` + + for i, f := range record { + if (i == 0 && strings.HasPrefix(f, comment)) || + (strings.Contains(f, quote) || strings.Contains(f, string(comma))) { + + f = strings.ReplaceAll(f, `"`, `""`) + record[i] = `"` + f + `"` + } + } +} + func (w *Writer) WriteAll(records [][]string) error { for _, r := range records { if err := w.Write(r); err != nil {