diff --git a/io/transform/gsheet2csv/LICENSE b/io/transform/gsheet2csv/LICENSE new file mode 100644 index 0000000..e55db06 --- /dev/null +++ b/io/transform/gsheet2csv/LICENSE @@ -0,0 +1,7 @@ +Authored in 2025 by AJ ONeal +To the extent possible under law, the author(s) have dedicated all copyright +and related and neighboring rights to this software to the public domain +worldwide. This software is distributed without any warranty. + +You should have received a copy of the CC0 Public Domain Dedication along with +this software. If not, see <https://creativecommons.org/publicdomain/zero/1.0/>. diff --git a/io/transform/gsheet2csv/README.md b/io/transform/gsheet2csv/README.md new file mode 100644 index 0000000..b498f8f --- /dev/null +++ b/io/transform/gsheet2csv/README.md @@ -0,0 +1,125 @@ +# gsheet2csv + +[![Go Reference](https://pkg.go.dev/badge/github.com/therootcompany/golib/io/transform/gsheet2csv.svg)](https://pkg.go.dev/github.com/therootcompany/golib/io/transform/gsheet2csv) + +A simple wrapper around `encoding/csv` to read Google Sheet CSVs from a URL, or from a given Reader. + +This does surprisingly little - you should probably just handle the boilerplate yourself. However, these are the problems it solves for us: + +- works with Google Sheet URLs, regardless of URL format + - Edit URL: `https://docs.google.com/spreadsheets/d/{docid}/edit?gid={gid}#gid={gid}` + - Share URL (Sheet 1): `https://docs.google.com/spreadsheets/d/{docid}/edit?usp=sharing` + - CSV Export URL: `https://docs.google.com/spreadsheets/d/{docid}/export?format=csv&usp=sharing&gid={gid}` + - anything with a path like `/spreadsheets/d/{docid}/` and (optionally) a hash or query param like `gid={gid}` +- can write out for import to gsheet (comments containing quotes or commas are quoted), \ + or in RFC form (comments are never quoted, but values beginning with a comment character are) +- swaps `\r\n` (Windows) for `\n` (Unix) and ensures trailing newline (a la `encoding/csv`) + +Note: + +- The Google Sheet must be shared to **Anyone with the link**. 
+- Read and write in 'gsheet' style for reciprocity of comment handling +- Be careful about single-column CSVs \ + (all comment-like lines are comments, same as with `encoding/csv` and empty lines) + +# Usage + +Same as `encoding/csv` (embedded), but with two extra options: + +```go +package main + +import ( + "fmt" + "os" + + "github.com/therootcompany/golib/io/transform/gsheet2csv" +) + +func main() { + switch len(os.Args) { + case 2: + break + case 1: + fmt.Fprintf(os.Stderr, "Usage: %s \n", os.Args[0]) + os.Exit(1) + } + urlOrPath := os.Args[1] + + gsr := gsheet2csv.NewReaderFrom(urlOrPath) + records, err := gsr.ReadAll() + if err != nil { + fmt.Fprintf(os.Stderr, "Error reading from %s: %v\n", gsr.URL, err) + os.Exit(1) + } + + csvw := gsheet2csv.NewWriter(os.Stdout) + csvw.Comment = gsr.Comment + if err := csvw.WriteAll(records); err != nil { + fmt.Fprintf(os.Stderr, "Error writing csv %v\n", err) + os.Exit(1) + } +} +``` + +# CLI + +There are a few convenience utilities: + +- `gsheet2csv` (also `gsheet2tsv`) +- `gsheet2env` + +## gsheet2csv + +They're only slightly different from a direct export of a Google CSV in that they reformat comments and newlines. + +The alterable behavior is almost exclusively for testing. + +### Installation + +```sh +go get github.com/therootcompany/golib/io/transform/gsheet2csv +``` + +### Usage + +```sh +gsheet2csv -raw -o ./gsheet.csv 'https://docs.google.com/spreadsheets/...' 
+ +gsheet2csv -d '\t' --write-style 'gsheet' ./gsheet.csv > ./gsheet.tsv + +gsheet2csv --strip-comments ./gsheet.csv > ./sheet.csv +``` + +```text +--raw download without processing +--print-ids print ids to stdout without download +--print-url print url to stdout without downloading +-o write records to file (default: stdout) +-d field delimiter (for output) +--read-delimiter input field delimiter (for testing reciprocity) +--crlf write using CRLF (\r\n) as the record separator +--comment '#' treat lines starting with # as comments +--strip-comments ignore single-field data beginning with a comment character +--read-style 'gsheet' (preserves comments as single-field records) + or 'rfc' (ignore lines starting with comment character) +--write-style 'gsheet' (quote single-field comments containing quotes or commas) + or 'rfc' (only quote values starting with a comment character) +``` + +### ASCII Delimiters + +``` +, comma +\t tab (or a normal tab) + space (just a normal space) +: colon +; semicolon +| pipe +^_ unit separator +^^ record separator +^] group separator +^\ file separator +\f form feed (also ^L) +\v vertical tab (also ^K) +``` diff --git a/io/transform/gsheet2csv/cmd/gsheet2csv/main.go b/io/transform/gsheet2csv/cmd/gsheet2csv/main.go new file mode 100644 index 0000000..1170d3f --- /dev/null +++ b/io/transform/gsheet2csv/cmd/gsheet2csv/main.go @@ -0,0 +1,220 @@ +package main + +import ( + "encoding/csv" + "errors" + "flag" + "fmt" + "io" + "os" + "strings" + "unicode/utf8" + + "github.com/therootcompany/golib/io/transform/gsheet2csv" +) + +type CSVReader interface { + Read() ([]string, error) + ReadAll() ([][]string, error) +} + +type CSVWriter interface { + Write([]string) error + WriteAll([][]string) error + Flush() + Error() error +} + +func main() { + var commentArg string + format := "CSV" + delim := ',' + if strings.Contains(os.Args[0], "tsv") { + delim = '\t' + format = "TSV" + } + + // Parse command-line flags + flag.StringVar(&commentArg, 
"comment", "#", "treat lines beginning with this rune as comments, 0 to disable (which may cause read errors)") + outputFile := flag.String("o", "", "Output "+format+" file (default: stdout)") + readDelimString := flag.String("read-delimiter", ",", "field delimiter to use for input file ('\\t' for tab, '^_' for Unit Separator, etc)") + delimString := flag.String("d", string(delim), "field delimiter to use for output file ('\\t' for tab, '^_' for Unit Separator, etc)") + useCRLF := flag.Bool("crlf", false, "use CRLF (\\r\\n) as record separator") + urlOnly := flag.Bool("print-url", false, "don't download, just print the Google Sheet URL") + parseOnly := flag.Bool("print-ids", false, "don't download, just print the Doc ID and Sheet ID (gid)") + rawOnly := flag.Bool("raw", false, "don't parse, just download") + noReadComments := flag.Bool("strip-comments", false, "strip comments when reading (gsheet-only, control rfc behavior with --comment)") + readStyle := flag.String("read-style", "gsheet", "'gsheet' or 'rfc' to read either as a gsheet or rfc CSV") + writeStyle := flag.String("write-style", "rfc", "'gsheet' or 'rfc' to write either for gsheet import or rfc CSV read") + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: %s [flags] \n", os.Args[0]) + fmt.Fprintf(os.Stderr, "Converts a Google Sheet to %s format.\n\n", format) + fmt.Fprintf(os.Stderr, "Flags:\n") + flag.PrintDefaults() + fmt.Fprintf(os.Stderr, "\nExample:\n") + fmt.Fprintf(os.Stderr, " %s -o output.tsv 'https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit?gid=559037238#gid=559037238'\n", os.Args[0]) + fmt.Fprintf(os.Stderr, " %s -o output.tsv 'file://gsheet.csv'\n", os.Args[0]) + fmt.Fprintf(os.Stderr, " %s -o output.tsv './gsheet.csv'\n", os.Args[0]) + } + flag.Parse() + + // Check for URL argument + if len(flag.Args()) != 1 { + fmt.Fprintf(os.Stderr, "Error: exactly one Google Sheet URL is required\n") + flag.Usage() + os.Exit(1) + } + url := flag.Args()[0] + + 
// Prepare output writer + var out *os.File + if *outputFile != "" { + var err error + out, err = os.Create(*outputFile) + if err != nil { + fmt.Fprintf(os.Stderr, "Error creating output file: %v\n", err) + os.Exit(1) + } + defer func() { _ = out.Close() }() + } else { + out = os.Stdout + } + + inputDelim, err := gsheet2csv.DecodeDelimiter(*readDelimString) + if err != nil { + fmt.Fprintf(os.Stderr, "Error decoding input delimiter: %v\n", err) + os.Exit(1) + } + + delim, err = gsheet2csv.DecodeDelimiter(*delimString) + if err != nil { + fmt.Fprintf(os.Stderr, "Error decoding output delimiter: %v\n", err) + os.Exit(1) + } + + var rc io.ReadCloser + if strings.HasPrefix(url, "https://") || strings.HasPrefix(url, "http://") { + docid, gid := gsheet2csv.ParseIDs(url) + if *parseOnly { + fmt.Printf("docid=%s\ngid=%s\n", docid, gid) + } else { + fmt.Fprintf(os.Stderr, "docid=%s\ngid=%s\n", docid, gid) + } + + sheetURL := gsheet2csv.ToCSVURL(docid, gid) + if *urlOnly { + fmt.Printf("%s\n", sheetURL) + } else { + fmt.Fprintf(os.Stderr, "downloading %s\n", sheetURL) + } + + if !*urlOnly { + resp, err := gsheet2csv.GetSheet(docid, gid) + if err != nil { + fmt.Fprintf(os.Stderr, "Error getting url: %v\n", err) + os.Exit(1) + } + defer func() { _ = resp.Body.Close() }() + rc = resp.Body + } + } else { + url = strings.TrimPrefix(url, "file://") + fmt.Fprintf(os.Stderr, "opening %s\n", url) + f, err := os.Open(url) + if err != nil { + fmt.Fprintf(os.Stderr, "Error opening file: %v\n", err) + os.Exit(1) + } + rc = f + } + + if out == os.Stdout { + fmt.Fprintf(os.Stderr, "\n") + } + + if *urlOnly || *parseOnly { + os.Exit(0) + return + } + + if *rawOnly { + if _, err := io.Copy(out, rc); err != nil { + fmt.Fprintf(os.Stderr, "Error getting url body: %v\n", err) + os.Exit(1) + } + return + } + + var comment rune + if commentArg == "0" { + comment = 0 + } else { + comment, _ = utf8.DecodeRuneInString(commentArg) + } + + // Create a reader for the Google Sheet + var csvr CSVReader + 
if *readStyle == "rfc" { + rfcr := csv.NewReader(rc) + rfcr.Comma = inputDelim + rfcr.Comment = comment + rfcr.FieldsPerRecord = -1 // Google Sheets is consistent, but our commented files are not + csvr = rfcr + } else { + gsr := gsheet2csv.NewReader(rc) + gsr.Comma = inputDelim + if *noReadComments { + gsr.Comment = comment + } else { + gsr.Comment = 0 + } + gsr.ReuseRecord = true + csvr = gsr + } + + // Create CSV writer + var csvw CSVWriter + // if *writeStyle == "gsheet" + { + gsw := gsheet2csv.NewWriter(out) + gsw.QuoteAmbiguousComments = *writeStyle == "gsheet" + gsw.Comment = comment + gsw.Comma = delim // Set delimiter to tab for TSV + gsw.UseCRLF = *useCRLF + csvw = gsw + } + // else { + // rfcw := csv.NewWriter(out) + // rfcw.Comma = delim + // rfcw.UseCRLF = *useCRLF + // csvw = rfcw + // } + + for { + // Convert each record + record, err := csvr.Read() + if err != nil { + if errors.Is(err, io.EOF) { + break + } + fmt.Fprintf(os.Stderr, "Error reading "+format+": %v\n", err) + os.Exit(1) + return + } + + if err := csvw.Write(record); err != nil { + fmt.Fprintf(os.Stderr, "Error writing "+format+": %v\n", err) + os.Exit(1) + return + } + } + csvw.Flush() + if err := csvw.Error(); err != nil { + fmt.Fprintf(os.Stderr, "Error flushing "+format+" writer: %v\n", err) + os.Exit(1) + } + + if out != os.Stdout { + fmt.Fprintf(os.Stderr, "wrote %s\n", *outputFile) + } +} diff --git a/io/transform/gsheet2csv/cmd/gsheet2tsv/main.go b/io/transform/gsheet2csv/cmd/gsheet2tsv/main.go new file mode 100644 index 0000000..1170d3f --- /dev/null +++ b/io/transform/gsheet2csv/cmd/gsheet2tsv/main.go @@ -0,0 +1,220 @@ +package main + +import ( + "encoding/csv" + "errors" + "flag" + "fmt" + "io" + "os" + "strings" + "unicode/utf8" + + "github.com/therootcompany/golib/io/transform/gsheet2csv" +) + +type CSVReader interface { + Read() ([]string, error) + ReadAll() ([][]string, error) +} + +type CSVWriter interface { + Write([]string) error + WriteAll([][]string) error + Flush() + 
Error() error +} + +func main() { + var commentArg string + format := "CSV" + delim := ',' + if strings.Contains(os.Args[0], "tsv") { + delim = '\t' + format = "TSV" + } + + // Parse command-line flags + flag.StringVar(&commentArg, "comment", "#", "treat lines beginning with this rune as comments, 0 to disable (which may cause read errors)") + outputFile := flag.String("o", "", "Output "+format+" file (default: stdout)") + readDelimString := flag.String("read-delimiter", ",", "field delimiter to use for input file ('\\t' for tab, '^_' for Unit Separator, etc)") + delimString := flag.String("d", string(delim), "field delimiter to use for output file ('\\t' for tab, '^_' for Unit Separator, etc)") + useCRLF := flag.Bool("crlf", false, "use CRLF (\\r\\n) as record separator") + urlOnly := flag.Bool("print-url", false, "don't download, just print the Google Sheet URL") + parseOnly := flag.Bool("print-ids", false, "don't download, just print the Doc ID and Sheet ID (gid)") + rawOnly := flag.Bool("raw", false, "don't parse, just download") + noReadComments := flag.Bool("strip-comments", false, "strip comments when reading (gsheet-only, control rfc behavior with --comment)") + readStyle := flag.String("read-style", "gsheet", "'gsheet' or 'rfc' to read either as a gsheet or rfc CSV") + writeStyle := flag.String("write-style", "rfc", "'gsheet' or 'rfc' to write either for gsheet import or rfc CSV read") + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: %s [flags] \n", os.Args[0]) + fmt.Fprintf(os.Stderr, "Converts a Google Sheet to %s format.\n\n", format) + fmt.Fprintf(os.Stderr, "Flags:\n") + flag.PrintDefaults() + fmt.Fprintf(os.Stderr, "\nExample:\n") + fmt.Fprintf(os.Stderr, " %s -o output.tsv 'https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit?gid=559037238#gid=559037238'\n", os.Args[0]) + fmt.Fprintf(os.Stderr, " %s -o output.tsv 'file://gsheet.csv'\n", os.Args[0]) + fmt.Fprintf(os.Stderr, " %s -o output.tsv 
'./gsheet.csv'\n", os.Args[0]) + } + flag.Parse() + + // Check for URL argument + if len(flag.Args()) != 1 { + fmt.Fprintf(os.Stderr, "Error: exactly one Google Sheet URL is required\n") + flag.Usage() + os.Exit(1) + } + url := flag.Args()[0] + + // Prepare output writer + var out *os.File + if *outputFile != "" { + var err error + out, err = os.Create(*outputFile) + if err != nil { + fmt.Fprintf(os.Stderr, "Error creating output file: %v\n", err) + os.Exit(1) + } + defer func() { _ = out.Close() }() + } else { + out = os.Stdout + } + + inputDelim, err := gsheet2csv.DecodeDelimiter(*readDelimString) + if err != nil { + fmt.Fprintf(os.Stderr, "Error decoding input delimiter: %v\n", err) + os.Exit(1) + } + + delim, err = gsheet2csv.DecodeDelimiter(*delimString) + if err != nil { + fmt.Fprintf(os.Stderr, "Error decoding output delimiter: %v\n", err) + os.Exit(1) + } + + var rc io.ReadCloser + if strings.HasPrefix(url, "https://") || strings.HasPrefix(url, "http://") { + docid, gid := gsheet2csv.ParseIDs(url) + if *parseOnly { + fmt.Printf("docid=%s\ngid=%s\n", docid, gid) + } else { + fmt.Fprintf(os.Stderr, "docid=%s\ngid=%s\n", docid, gid) + } + + sheetURL := gsheet2csv.ToCSVURL(docid, gid) + if *urlOnly { + fmt.Printf("%s\n", sheetURL) + } else { + fmt.Fprintf(os.Stderr, "downloading %s\n", sheetURL) + } + + if !*urlOnly { + resp, err := gsheet2csv.GetSheet(docid, gid) + if err != nil { + fmt.Fprintf(os.Stderr, "Error getting url: %v\n", err) + os.Exit(1) + } + defer func() { _ = resp.Body.Close() }() + rc = resp.Body + } + } else { + url = strings.TrimPrefix(url, "file://") + fmt.Fprintf(os.Stderr, "opening %s\n", url) + f, err := os.Open(url) + if err != nil { + fmt.Fprintf(os.Stderr, "Error opening file: %v\n", err) + os.Exit(1) + } + rc = f + } + + if out == os.Stdout { + fmt.Fprintf(os.Stderr, "\n") + } + + if *urlOnly || *parseOnly { + os.Exit(0) + return + } + + if *rawOnly { + if _, err := io.Copy(out, rc); err != nil { + fmt.Fprintf(os.Stderr, "Error 
getting url body: %v\n", err) + os.Exit(1) + } + return + } + + var comment rune + if commentArg == "0" { + comment = 0 + } else { + comment, _ = utf8.DecodeRuneInString(commentArg) + } + + // Create a reader for the Google Sheet + var csvr CSVReader + if *readStyle == "rfc" { + rfcr := csv.NewReader(rc) + rfcr.Comma = inputDelim + rfcr.Comment = comment + rfcr.FieldsPerRecord = -1 // Google Sheets is consistent, but our commented files are not + csvr = rfcr + } else { + gsr := gsheet2csv.NewReader(rc) + gsr.Comma = inputDelim + if *noReadComments { + gsr.Comment = comment + } else { + gsr.Comment = 0 + } + gsr.ReuseRecord = true + csvr = gsr + } + + // Create CSV writer + var csvw CSVWriter + // if *writeStyle == "gsheet" + { + gsw := gsheet2csv.NewWriter(out) + gsw.QuoteAmbiguousComments = *writeStyle == "gsheet" + gsw.Comment = comment + gsw.Comma = delim // Set delimiter to tab for TSV + gsw.UseCRLF = *useCRLF + csvw = gsw + } + // else { + // rfcw := csv.NewWriter(out) + // rfcw.Comma = delim + // rfcw.UseCRLF = *useCRLF + // csvw = rfcw + // } + + for { + // Convert each record + record, err := csvr.Read() + if err != nil { + if errors.Is(err, io.EOF) { + break + } + fmt.Fprintf(os.Stderr, "Error reading "+format+": %v\n", err) + os.Exit(1) + return + } + + if err := csvw.Write(record); err != nil { + fmt.Fprintf(os.Stderr, "Error writing "+format+": %v\n", err) + os.Exit(1) + return + } + } + csvw.Flush() + if err := csvw.Error(); err != nil { + fmt.Fprintf(os.Stderr, "Error flushing "+format+" writer: %v\n", err) + os.Exit(1) + } + + if out != os.Stdout { + fmt.Fprintf(os.Stderr, "wrote %s\n", *outputFile) + } +} diff --git a/io/transform/gsheet2csv/fixtures/example.go b/io/transform/gsheet2csv/fixtures/example.go new file mode 100644 index 0000000..ccbc2d3 --- /dev/null +++ b/io/transform/gsheet2csv/fixtures/example.go @@ -0,0 +1,33 @@ +package main + +import ( + "fmt" + "os" + + "github.com/therootcompany/golib/io/transform/gsheet2csv" +) + +func main() { 
+ switch len(os.Args) { + case 2: + break + case 1: + fmt.Fprintf(os.Stderr, "Usage: %s \n", os.Args[0]) + os.Exit(1) + } + urlOrPath := os.Args[1] + + gsr := gsheet2csv.NewReaderFrom(urlOrPath) + records, err := gsr.ReadAll() + if err != nil { + fmt.Fprintf(os.Stderr, "Error reading from %s: %v\n", gsr.URL, err) + os.Exit(1) + } + + csvw := gsheet2csv.NewWriter(os.Stdout) + csvw.Comment = gsr.Comment + if err := csvw.WriteAll(records); err != nil { + fmt.Fprintf(os.Stderr, "Error writing csv %v\n", err) + os.Exit(1) + } +} diff --git a/io/transform/gsheet2csv/fixtures/gsheet-raw.csv b/io/transform/gsheet2csv/fixtures/gsheet-raw.csv new file mode 100644 index 0000000..571b4ae --- /dev/null +++ b/io/transform/gsheet2csv/fixtures/gsheet-raw.csv @@ -0,0 +1,18 @@ +# this is a comment,, +"# this is, well, a quoted comment",, +"# this is a ""super""-quoted comment",, +Key,Value, +Name,55, +Girlfriend's Age,55, +,, +My IQ,55, +,55, +"Key,with,Comma",, +,"Value,with,Comma", +"Quoted ""Key""",Normal Value, +Normal Key,"Quoted ""Value""", +"Quoted ""Key""",, +,"Quoted ""Value""", +x,y,z +"# comment with trailing comma,",, +#1,2,#3 \ No newline at end of file diff --git a/io/transform/gsheet2csv/fixtures/gsheet-stripped.csv b/io/transform/gsheet2csv/fixtures/gsheet-stripped.csv new file mode 100644 index 0000000..dff80d3 --- /dev/null +++ b/io/transform/gsheet2csv/fixtures/gsheet-stripped.csv @@ -0,0 +1,14 @@ +Key,Value, +Name,55, +Girlfriend's Age,55, +,, +My IQ,55, +,55, +"Key,with,Comma",, +,"Value,with,Comma", +"Quoted ""Key""",Normal Value, +Normal Key,"Quoted ""Value""", +"Quoted ""Key""",, +,"Quoted ""Value""", +x,y,z +"#1",2,#3 diff --git a/io/transform/gsheet2csv/fixtures/gsheet-to-gsheet.csv b/io/transform/gsheet2csv/fixtures/gsheet-to-gsheet.csv new file mode 100644 index 0000000..f1627e1 --- /dev/null +++ b/io/transform/gsheet2csv/fixtures/gsheet-to-gsheet.csv @@ -0,0 +1,18 @@ +# this is a comment +"# this is, well, a quoted comment" +"# this is a ""super""-quoted 
comment" +Key,Value, +Name,55, +Girlfriend's Age,55, +,, +My IQ,55, +,55, +"Key,with,Comma",, +,"Value,with,Comma", +"Quoted ""Key""",Normal Value, +Normal Key,"Quoted ""Value""", +"Quoted ""Key""",, +,"Quoted ""Value""", +x,y,z +"# comment with trailing comma," +"#1",2,#3 diff --git a/io/transform/gsheet2csv/fixtures/gsheet-to-rfc.csv b/io/transform/gsheet2csv/fixtures/gsheet-to-rfc.csv new file mode 100644 index 0000000..a2dc32b --- /dev/null +++ b/io/transform/gsheet2csv/fixtures/gsheet-to-rfc.csv @@ -0,0 +1,18 @@ +# this is a comment +# this is, well, a quoted comment +# this is a "super"-quoted comment +Key,Value, +Name,55, +Girlfriend's Age,55, +,, +My IQ,55, +,55, +"Key,with,Comma",, +,"Value,with,Comma", +"Quoted ""Key""",Normal Value, +Normal Key,"Quoted ""Value""", +"Quoted ""Key""",, +,"Quoted ""Value""", +x,y,z +# comment with trailing comma, +"#1",2,#3 diff --git a/io/transform/gsheet2csv/fixtures/gsheet-to-rfc.tsv b/io/transform/gsheet2csv/fixtures/gsheet-to-rfc.tsv new file mode 100644 index 0000000..ae18dab --- /dev/null +++ b/io/transform/gsheet2csv/fixtures/gsheet-to-rfc.tsv @@ -0,0 +1,18 @@ +# this is a comment +# this is, well, a quoted comment +# this is a "super"-quoted comment +Key Value +Name 55 +Girlfriend's Age 55 + +My IQ 55 + 55 +Key,with,Comma + Value,with,Comma +"Quoted ""Key""" Normal Value +Normal Key "Quoted ""Value""" +"Quoted ""Key""" + "Quoted ""Value""" +x y z +# comment with trailing comma, +"#1" 2 #3 diff --git a/io/transform/gsheet2csv/go.mod b/io/transform/gsheet2csv/go.mod new file mode 100644 index 0000000..f57d483 --- /dev/null +++ b/io/transform/gsheet2csv/go.mod @@ -0,0 +1,3 @@ +module github.com/therootcompany/golib/io/transform/gsheet2csv + +go 1.24.6 diff --git a/io/transform/gsheet2csv/gsheet2csv.go b/io/transform/gsheet2csv/gsheet2csv.go new file mode 100644 index 0000000..ac9d962 --- /dev/null +++ b/io/transform/gsheet2csv/gsheet2csv.go @@ -0,0 +1,309 @@ +// Authored in 2025 by AJ ONeal 
// ASCII control characters accepted as field delimiters by DecodeDelimiter.
const (
	fileSeparator   = '\x1c'
	groupSeparator  = '\x1d'
	recordSeparator = '\x1e'
	unitSeparator   = '\x1f'
)

// ErrHTTPGet is returned (wrapped with the status code) by GetSheet when the
// server answers with anything other than 200 OK.
var ErrHTTPGet = errors.New("did not get 200 OK when downloading from URL")

// For mocking for tests
var httpGet = http.Get

// Reader wraps an embedded *csv.Reader and optionally skips comment records:
// records whose first field begins with Comment and whose other fields are
// all empty.
type Reader struct {
	*csv.Reader
	// DocID and GID identify the sheet when the Reader was built from a URL.
	DocID string
	GID   string
	// URL is the CSV export URL, or the local path for file input.
	URL string
	// Comment is the comment rune; 0 disables comment skipping.
	Comment rune

	r      io.Reader
	resp   *http.Response
	closer io.Closer // underlying file or response body, nil once closed
	close  bool      // whether this Reader owns (and must close) its source
	err    error     // construction error, reported by every Read
}

// NewReaderFrom opens urlOrPath, which is either an http(s) Google Sheet URL
// or a local path (optionally prefixed with file://). Any error is deferred
// and returned by the first call to Read.
func NewReaderFrom(urlOrPath string) *Reader {
	if strings.HasPrefix(urlOrPath, "https://") || strings.HasPrefix(urlOrPath, "http://") {
		return NewReaderFromURL(urlOrPath)
	}

	path := strings.TrimPrefix(urlOrPath, "file://")
	f, err := os.Open(path)
	if err != nil {
		// Pass a literal nil rather than the nil *os.File, which would
		// become a non-nil io.Reader holding a nil pointer.
		r := NewReader(nil)
		r.URL = path
		r.err = err
		return r
	}

	r := NewReader(f)
	r.URL = path
	r.closer = f
	r.close = true
	return r
}

// NewReaderFromURL extracts the document and sheet ids from any Google Sheet
// URL form and downloads the sheet as CSV.
func NewReaderFromURL(url string) *Reader {
	docid, gid := ParseIDs(url)

	return NewReaderFromIDs(docid, gid)
}

// NewReaderFromIDs downloads the sheet identified by docid and gid. A
// download error is deferred and returned by the first call to Read; the
// URL, DocID and GID fields are populated either way so callers can report
// what was attempted.
func NewReaderFromIDs(docid, gid string) *Reader {
	resp, err := GetSheet(docid, gid)
	if err != nil {
		r := NewReader(nil)
		r.URL = ToCSVURL(docid, gid)
		r.DocID = docid
		r.GID = gid
		r.err = err
		return r
	}

	r := NewReader(resp.Body)
	r.URL = ToCSVURL(docid, gid)
	r.DocID = docid
	r.GID = gid
	r.resp = resp
	r.closer = resp.Body
	r.close = true
	return r
}

// ToCSVURL builds the CSV export URL for the given document and sheet ids.
func ToCSVURL(docid, gid string) string {
	return fmt.Sprintf("https://docs.google.com/spreadsheets/d/%s/export?format=csv&usp=sharing&gid=%s", docid, gid)
}

// GetSheet downloads the CSV export of the sheet. On success the caller owns
// resp.Body and must close it. A non-200 response yields an error that wraps
// ErrHTTPGet and includes the status code.
func GetSheet(docid, gid string) (*http.Response, error) {
	resp, err := httpGet(ToCSVURL(docid, gid))
	if err != nil {
		return nil, err
	}

	if resp.StatusCode != http.StatusOK {
		// The body of an error page is of no use; a close error even less so.
		_ = resp.Body.Close()
		return nil, fmt.Errorf("%w: got status %d", ErrHTTPGet, resp.StatusCode)
	}

	return resp, nil
}

// NewReader returns a Reader over r configured for Google Sheet exports:
// comma-delimited, variable field counts, strict quotes, and '#' comments.
func NewReader(r io.Reader) *Reader {
	csvr := csv.NewReader(r)
	csvr.Comma = ','
	csvr.Comment = 0          // to allow distinguishing between quoted comments and fields
	csvr.FieldsPerRecord = -1 // Google Sheets is consistent, but our commented files are not
	csvr.LazyQuotes = false   // fields that need quotes use them correctly
	csvr.TrimLeadingSpace = false
	csvr.ReuseRecord = false
	return &Reader{
		Reader:  csvr,
		Comment: '#',
		r:       r,
	}
}

// DecodeDelimiter converts a command-line delimiter spelling into a rune.
// It accepts caret notation ("^_", "^I", ...), backslash escapes ("\\x1f",
// "\\t", "\\f", "\\v"), or any single literal character. It returns an error
// for an empty string, invalid UTF-8, or more than one character.
func DecodeDelimiter(delimString string) (rune, error) {
	switch delimString {
	case "^_", "\\x1f":
		return unitSeparator, nil
	case "^^", "\\x1e":
		return recordSeparator, nil
	case "^]", "\\x1d":
		return groupSeparator, nil
	case "^\\", "\\x1c":
		return fileSeparator, nil
	case "^L", "\\f":
		return '\f', nil
	case "^K", "\\v":
		return '\v', nil
	case "^I", "\\t":
		return '\t', nil
	}

	delim, size := utf8.DecodeRuneInString(delimString)
	switch {
	case size == 0:
		return 0, errors.New("delimiter must not be empty")
	case delim == utf8.RuneError && size == 1:
		return 0, fmt.Errorf("delimiter %q is not valid UTF-8", delimString)
	case size != len(delimString):
		return 0, fmt.Errorf("delimiter %q must be a single character", delimString)
	}
	return delim, nil
}

// Close releases the underlying file or HTTP response body if this Reader
// opened it. It is safe to call more than once, and is called automatically
// when Read encounters an error (including io.EOF).
func (r *Reader) Close() error {
	if !r.close || r.closer == nil {
		return nil
	}
	err := r.closer.Close()
	r.closer = nil
	return err
}

// Read returns the next record, skipping comment records when Comment is
// non-zero. A construction error from NewReaderFrom* is returned first. On
// any read error, including io.EOF, the underlying source is closed.
func (r *Reader) Read() ([]string, error) {
	if r.err != nil {
		return nil, r.err
	}

	for {
		record, err := r.Reader.Read()
		if err != nil {
			// The read error is the one worth reporting, not the close error.
			_ = r.Close()
			return nil, err
		}

		if r.Comment > 0 && len(record) > 0 {
			first, _ := utf8.DecodeRuneInString(record[0])
			if first == r.Comment && lastNonEmptyField(record) == 0 {
				continue
			}
		}
		return record, nil
	}
}

// ReadAll reads the remaining records. io.EOF is not treated as an error; on
// any other error the records read so far are returned alongside it.
func (r *Reader) ReadAll() ([][]string, error) {
	var records [][]string

	for {
		record, err := r.Read()
		if errors.Is(err, io.EOF) {
			return records, nil
		}
		if err != nil {
			return records, err
		}
		records = append(records, record)
	}
}

// lastNonEmptyField returns the index of the last non-empty field in record,
// or -1 if every field is empty.
func lastNonEmptyField(record []string) int {
	for i := len(record) - 1; i >= 0; i-- {
		if record[i] != "" {
			return i
		}
	}
	return -1
}

// ParseIDs extracts the document id and sheet id (gid) from a Google Sheet
// URL of any form containing "/spreadsheets/d/{docid}". The gid is taken from
// a "gid=" query or fragment parameter and defaults to "0". Both results are
// empty when no document id is found.
func ParseIDs(urlStr string) (docid string, gid string) {
	const prefix = "/spreadsheets/d/"
	_, rest, found := strings.Cut(urlStr, prefix)
	if !found {
		return "", ""
	}

	// The id ends at the next path segment, query, or fragment.
	docid = rest
	if end := strings.IndexAny(rest, "/?#"); end != -1 {
		docid = rest[:end]
	}
	if docid == "" {
		return "", ""
	}

	gid = findGID(urlStr)
	if gid == "" {
		gid = "0"
	}
	return docid, gid
}

// findGID returns the first non-empty value of a "gid" parameter in the query
// or fragment of urlStr, or "" if there is none.
func findGID(urlStr string) string {
	const key = "gid="
	offset := 0
	for {
		i := strings.Index(urlStr[offset:], key)
		if i == -1 {
			return ""
		}
		start := offset + i
		offset = start + len(key)

		// Only accept a whole parameter name, not the tail of another one
		// such as "rangid=".
		if start == 0 || !strings.ContainsRune("?&#", rune(urlStr[start-1])) {
			continue
		}

		value := urlStr[offset:]
		if end := strings.IndexAny(value, "#&?/"); end != -1 {
			value = value[:end]
		}
		if value != "" {
			return value
		}
	}
}

// Writer wraps an embedded *csv.Writer and adds comment-aware output.
type Writer struct {
	*csv.Writer
	// Comment is the comment rune; 0 disables all comment handling.
	Comment rune
	// QuoteAmbiguousComments quotes comment lines that contain quotes,
	// delimiters, or line breaks (for re-import into Google Sheets) instead
	// of writing them out verbatim.
	QuoteAmbiguousComments bool
	w                      io.Writer
}

// NewWriter returns a Writer to w that treats '#' as the comment rune.
func NewWriter(w io.Writer) *Writer {
	return &Writer{
		Writer:  csv.NewWriter(w),
		Comment: '#',
		w:       w,
	}
}

// needsQuotes reports whether field cannot be written bare on the
// hand-written path used by Write.
func needsQuotes(field string, comma rune) bool {
	return strings.ContainsRune(field, comma) || strings.ContainsAny(field, "\"\r\n")
}

// Write writes one record. Records whose first field does not begin with the
// comment rune are delegated to the embedded csv.Writer. Otherwise:
//
//   - a comment (only the first field is non-empty) is written as a single
//     field: verbatim by default, or quoted only if needed when
//     QuoteAmbiguousComments is set;
//   - data whose first field merely begins with the comment rune has that
//     field quoted so it is not mistaken for a comment when read back.
//
// The record passed in is never modified.
func (w *Writer) Write(record []string) error {
	// Not handling comments? Move along.
	if w.Comment == 0 || len(record) == 0 {
		return w.Writer.Write(record)
	}

	// First char not a comment char? Move along.
	if first, _ := utf8.DecodeRuneInString(record[0]); first != w.Comment {
		return w.Writer.Write(record)
	}

	// The line is written straight to the destination, so everything
	// buffered by the csv.Writer has to go out first to preserve order.
	w.Flush()
	if err := w.Error(); err != nil {
		return err
	}

	newline := "\n"
	if w.UseCRLF {
		newline = "\r\n"
	}

	// Is this a true comment? Or data that begins with the comment char?
	isComment := lastNonEmptyField(record) == 0
	if isComment && !w.QuoteAmbiguousComments {
		// Written raw so readers that honor comments see the comment rune
		// first; a comment containing a line break will span lines.
		_, err := io.WriteString(w.w, record[0]+newline)
		return err
	}

	fields := record
	if isComment {
		fields = record[:1] // drop the empty trailing fields
	}

	var line strings.Builder
	for i, field := range fields {
		if i > 0 {
			line.WriteRune(w.Comma)
		}
		// The first field of a data record is always quoted to mark it as
		// data rather than a comment.
		forceQuote := i == 0 && !isComment
		if forceQuote || needsQuotes(field, w.Comma) {
			line.WriteByte('"')
			line.WriteString(strings.ReplaceAll(field, `"`, `""`))
			line.WriteByte('"')
		} else {
			line.WriteString(field)
		}
	}
	line.WriteString(newline)

	_, err := io.WriteString(w.w, line.String())
	return err
}

// WriteAll writes every record with Write, then flushes and returns any
// error from the embedded csv.Writer.
func (w *Writer) WriteAll(records [][]string) error {
	for _, r := range records {
		if err := w.Write(r); err != nil {
			return err
		}
	}
	w.Flush()
	return w.Error()
}
+const sampleCSV = `# Generated by ollama list +"# Sample Quoted Comment, with ""quotes"" itself" +"NAME","ID","SIZE","MODIFIED" +"qwen3-coder:30b","06c1097efce0","18 GB","8 days ago" +"gpt-oss:20b","aa4295ac10c3","13 GB","8 days ago" + +"gpt-oss:latest","aa4295ac10c3","13 GB","7 weeks ago" +` + +// malformedCSV for testing error handling. +const malformedCSV = `# Comment +"NAME","ID","SIZE","MODIFIED +"qwen3-coder:30b","06c1097efce0","18 GB","8 days ago" +` + +// TestParseIDs verifies the ParseIDs function for various URL formats. +func TestParseIDs(t *testing.T) { + tests := []struct { + name string + url string + wantDoc string + wantGid string + }{ + { + name: "Google Sheets Edit / Share URL with gid", + url: "https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit?gid=559037238#gid=559037238", + wantDoc: "1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34", + wantGid: "559037238", + }, + { + name: "Google Sheets CSV URL with gid", + url: "https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/export?format=csv&usp=sharing&gid=559037238", + wantDoc: "1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34", + wantGid: "559037238", + }, + { + name: "URL without gid", + url: "https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit", + wantDoc: "1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34", + wantGid: "0", + }, + { + name: "Invalid URL", + url: "https://example.com/invalid", + wantDoc: "", + wantGid: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotDoc, gotGid := ParseIDs(tt.url) + if gotDoc != tt.wantDoc { + t.Errorf("ParseIDs() docid = %q, want %q", gotDoc, tt.wantDoc) + } + if gotGid != tt.wantGid { + t.Errorf("ParseIDs() gid = %q, want %q", gotGid, tt.wantGid) + } + }) + } +} + +// TestNewReaderFromURL tests initializing a Reader from a Google Sheets URL. 
+func TestNewReaderFromURL(t *testing.T) { + originalGet := httpGet + defer func() { httpGet = originalGet }() + + url := "https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit?gid=559037238" + + // Test successful HTTP response + mockResp := &http.Response{ + StatusCode: http.StatusOK, + Body: io.NopCloser(strings.NewReader(sampleCSV)), + } + client := &mockHTTPClient{resp: mockResp} + httpGet = client.Get + + reader := NewReaderFromURL(url) + if reader.err != nil { + t.Errorf("NewReaderFromURL() unexpected error: %v", reader.err) + } + if reader.resp != mockResp { + t.Error("NewReaderFromURL() did not set response correctly") + } + if !reader.close { + t.Error("NewReaderFromURL() did not set close flag") + } + + // Test HTTP failure + client = &mockHTTPClient{resp: mockResp} + client.err = errors.New("network error") + httpGet = client.Get + + reader = NewReaderFromURL(url) + if reader.err == nil { + t.Error("NewReaderFromURL() expected error, got nil") + } + + // Test non-200 status + client = &mockHTTPClient{resp: &http.Response{ + StatusCode: http.StatusNotFound, + Body: io.NopCloser(strings.NewReader("these aren't the droids you're looking for")), + }} + httpGet = client.Get + + reader = NewReaderFromURL(url) + if reader.err == nil { + t.Error("NewReaderFromURL() expected error for non-200 status, got nil") + } +} + +// TestRead tests the Read method for comment handling. 
+func TestRead(t *testing.T) { + tests := []struct { + name string + preserveComments bool + expected [][]string + }{ + { + name: "Skip comments", + expected: [][]string{ + {"NAME", "ID", "SIZE", "MODIFIED"}, + {"qwen3-coder:30b", "06c1097efce0", "18 GB", "8 days ago"}, + {"gpt-oss:20b", "aa4295ac10c3", "13 GB", "8 days ago"}, + {"gpt-oss:latest", "aa4295ac10c3", "13 GB", "7 weeks ago"}, + }, + }, + { + name: "Don't skip comments", + preserveComments: true, + expected: [][]string{ + {"# Generated by ollama list"}, + {"# Sample Quoted Comment, with \"quotes\" itself"}, + {"NAME", "ID", "SIZE", "MODIFIED"}, + {"qwen3-coder:30b", "06c1097efce0", "18 GB", "8 days ago"}, + {"gpt-oss:20b", "aa4295ac10c3", "13 GB", "8 days ago"}, + {"gpt-oss:latest", "aa4295ac10c3", "13 GB", "7 weeks ago"}, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + reader := NewReader(strings.NewReader(sampleCSV)) + if tt.preserveComments { + reader.Comment = 0 + } + + for i, want := range tt.expected { + got, err := reader.Read() + if err != nil { + t.Errorf("Read() error at record %d: %v", i, err) + } + if !slices.Equal(got, want) { + t.Errorf("Read() record %d = %v, want %v", i, got, want) + } + } + + // Verify EOF + _, err := reader.Read() + if !errors.Is(err, io.EOF) { + t.Errorf("Read() expected EOF, got %v", err) + } + }) + } +} + +// TestReadAll tests the ReadAll method for different configurations. 
+func TestReadAll(t *testing.T) { + tests := []struct { + name string + expected [][]string + }{ + { + name: "Skip comments", + expected: [][]string{ + {"NAME", "ID", "SIZE", "MODIFIED"}, + {"qwen3-coder:30b", "06c1097efce0", "18 GB", "8 days ago"}, + {"gpt-oss:20b", "aa4295ac10c3", "13 GB", "8 days ago"}, + {"gpt-oss:latest", "aa4295ac10c3", "13 GB", "7 weeks ago"}, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + reader := NewReader(strings.NewReader(sampleCSV)) + + got, err := reader.ReadAll() + if err != nil { + t.Errorf("ReadAll() error: %v", err) + } + if len(got) != len(tt.expected) { + t.Errorf("ReadAll() returned %d records, want %d", len(got), len(tt.expected)) + } + for i, want := range tt.expected { + if !slices.Equal(got[i], want) { + t.Errorf("ReadAll() record %d = %v, want %v", i, got[i], want) + } + } + }) + } +} + +// TestNewReaderFromURLWithMalformedCSV tests NewReaderFromURL with malformed CSV. +func TestNewReaderFromURLWithMalformedCSV(t *testing.T) { + mockResp := &http.Response{ + StatusCode: http.StatusOK, + Body: io.NopCloser(strings.NewReader(malformedCSV)), + } + client := &mockHTTPClient{resp: mockResp} + originalGet := httpGet + httpGet = client.Get + defer func() { httpGet = originalGet }() + + url := "https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit?gid=559037238" + reader := NewReaderFromURL(url) + if reader.err != nil { + t.Errorf("NewReaderFromURL() unexpected error: %v", reader.err) + } + + // Reading should fail due to malformed CSV + _, err := reader.Read() + if err == nil { + t.Error("Read() expected error for malformed CSV, got nil") + } +}