feat(gsheet2csv): parse URLs and CSVs with comments

This commit is contained in:
AJ ONeal 2025-10-10 18:13:12 -06:00
parent dc951ce388
commit 24ec3f021d
No known key found for this signature in database
13 changed files with 1252 additions and 0 deletions

View File

@ -0,0 +1,7 @@
Authored in 2025 by AJ ONeal <aj@therootcompany.com>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <https://creativecommons.org/publicdomain/zero/1.0/>.

View File

@ -0,0 +1,125 @@
# gsheet2csv
[![Go Reference](https://pkg.go.dev/badge/github.com/therootcompany/golib/io/transform/gsheet2csv.svg)](https://pkg.go.dev/github.com/therootcompany/golib/io/transform/gsheet2csv)
A simple wrapper around `encoding/csv` to read Google Sheet CSVs from URL, or a given Reader.
This does surprisingly little - you should probably just handle the boilerplate yourself. However, these are the problems it solves for us:
- works with Google Sheet URLs, regardless of URL format
- Edit URL: <https://docs.google.com/spreadsheets/d/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX/edit?gid=0000000000#gid=0000000000>
- Share URL (Sheet 1): <https://docs.google.com/spreadsheets/d/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX/edit?usp=sharing>
- CSV Export URL: <https://docs.google.com/spreadsheets/d/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX/export?format=csv&usp=sharing&gid=0000000000>
- anything with a path like `/spreadsheets/d/{docid}/` and (optionally) a hash or query param like `gid={gid}`
- can write out for import to gsheet (comments containing quotes or commas are quoted), \
or in RFC form (comments are never quoted, but values beginning with a comment character are)
- swaps `\r\n` (Windows) for `\n` (Unix) and ensures a trailing newline (a la `encoding/csv`)
Note:
- The Google Sheet must be shared to **Anyone with the link**.
- Read and write in 'gsheet' style for reciprocity of comment handling
- Be careful about single-column CSVs \
(a lone value that begins with the comment character is indistinguishable from a comment and is treated as one, much like `encoding/csv` skips empty lines)
# Usage
Same as `encoding/csv` (embedded), but with two extra options:
```go
package main
import (
"fmt"
"os"
"github.com/therootcompany/golib/io/transform/gsheet2csv"
)
func main() {
switch len(os.Args) {
case 2:
break
case 1:
fmt.Fprintf(os.Stderr, "Usage: %s <url>\n", os.Args[0])
os.Exit(1)
}
urlOrPath := os.Args[1]
gsr := gsheet2csv.NewReaderFrom(urlOrPath)
records, err := gsr.ReadAll()
if err != nil {
fmt.Fprintf(os.Stderr, "Error reading from %s: %v\n", gsr.URL, err)
os.Exit(1)
}
csvw := gsheet2csv.NewWriter(os.Stdout)
csvw.Comment = gsr.Comment
if err := csvw.WriteAll(records); err != nil {
fmt.Fprintf(os.Stderr, "Error writing csv %v\n", err)
os.Exit(1)
}
}
```
# CLI
There are a few convenience utilities:
- `gsheet2csv` (also `gsheet2tsv`)
- `gsheet2env`
## gsheet2csv
They're only slightly different from a direct export of a Google CSV in that they reformat comments and newlines.
The alterable behavior is almost exclusively for testing.
### Installation
```sh
go get github.com/therootcompany/golib/io/transform/gsheet2csv
```
### Usage
```sh
gsheet2csv -raw -o ./gsheet.csv 'https://docs.google.com/spreadsheets/...'
gsheet2csv -d '\t' --write-style 'gsheet' ./gsheet.csv > ./gsheet.tsv
gsheet2csv --strip-comments ./gsheet.csv > ./sheet.csv
```
```text
--raw download without processing
--print-ids print ids to stdout without download
--print-url print url to stdout without downloading
-o <filepath> write records to file (default: stdout)
-d field delimiter (for output)
--read-delimiter input field delimiter (for testing reciprocity)
--crlf write using CRLF (\r\n) as the record separator
--comment '#' treat lines starting with # as comments
--strip-comments ignore single-field data beginning with a comment character
--read-style 'gsheet' (preserves comments as single-field records)
or 'rfc' (ignore lines starting with comment character)
--write-style 'gsheet' (quote single-field comments containing quotes or commas)
or 'rfc' (only quote values starting with a comment character)
```
### ASCII Delimiters
```
, comma
\t tab (or a normal tab)
space (just a normal space)
: colon
; semicolon
| pipe
^_ unit separator
^^ record separator
^] group separator
^\ file separator
\f form feed (also ^L)
\v vertical tab (also ^K)
```

View File

@ -0,0 +1,220 @@
package main
import (
"encoding/csv"
"errors"
"flag"
"fmt"
"io"
"os"
"strings"
"unicode/utf8"
"github.com/therootcompany/golib/io/transform/gsheet2csv"
)
// CSVReader is the read-side behavior main needs from a CSV reader. Both
// *csv.Reader and *gsheet2csv.Reader are assigned to it below, so the
// conversion loop does not care which --read-style was chosen.
type CSVReader interface {
	// Read returns the next record, or an error (io.EOF at end of input).
	Read() ([]string, error)
	// ReadAll returns all remaining records.
	ReadAll() ([][]string, error)
}
// CSVWriter is the write-side behavior main needs from a CSV writer.
// It mirrors the method set of encoding/csv's Writer, which
// gsheet2csv.Writer embeds.
type CSVWriter interface {
	// Write writes a single record.
	Write([]string) error
	// WriteAll writes multiple records.
	WriteAll([][]string) error
	// Flush pushes buffered data to the underlying writer.
	Flush()
	// Error reports an error from a previous Write or Flush.
	Error() error
}
// main converts a Google Sheet (given by URL) or a local CSV file (given by
// path or file:// URL) to CSV, or to TSV when the program name contains "tsv".
//
// Progress messages go to stderr; records go to the -o file or stdout.
// With --print-ids or --print-url nothing is downloaded and no output file is
// created; with --raw the input is copied through unparsed.
func main() {
	var commentArg string
	format := "CSV"
	delim := ','
	// The same program acts as the TSV tool when invoked under a "tsv" name.
	if strings.Contains(os.Args[0], "tsv") {
		delim = '\t'
		format = "TSV"
	}

	// Parse command-line flags
	flag.StringVar(&commentArg, "comment", "#", "treat lines beginning with this rune as comments, 0 to disable (which may cause read errors)")
	outputFile := flag.String("o", "", "Output "+format+" file (default: stdout)")
	readDelimString := flag.String("read-delimiter", ",", "field delimiter to use for input file ('\\t' for tab, '^_' for Unit Separator, etc)")
	delimString := flag.String("d", string(delim), "field delimiter to use for output file ('\\t' for tab, '^_' for Unit Separator, etc)")
	useCRLF := flag.Bool("crlf", false, "use CRLF (\\r\\n) as record separator")
	urlOnly := flag.Bool("print-url", false, "don't download, just print the Google Sheet URL")
	parseOnly := flag.Bool("print-ids", false, "don't download, just print the Doc ID and Sheet ID (gid)")
	rawOnly := flag.Bool("raw", false, "don't parse, just download")
	noReadComments := flag.Bool("strip-comments", false, "strip comments when reading (gsheet-only, control rfc behavior with --comment)")
	readStyle := flag.String("read-style", "gsheet", "'gsheet' or 'rfc' to read either as a gsheet or rfc CSV")
	writeStyle := flag.String("write-style", "rfc", "'gsheet' or 'rfc' to write either for gsheet import or rfc CSV read")
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: %s [flags] <google-sheet-url-or-file-path>\n", os.Args[0])
		fmt.Fprintf(os.Stderr, "Converts a Google Sheet to %s format.\n\n", format)
		fmt.Fprintf(os.Stderr, "Flags:\n")
		flag.PrintDefaults()
		fmt.Fprintf(os.Stderr, "\nExample:\n")
		fmt.Fprintf(os.Stderr, " %s -o output.tsv 'https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit?gid=559037238#gid=559037238'\n", os.Args[0])
		fmt.Fprintf(os.Stderr, " %s -o output.tsv 'file://gsheet.csv'\n", os.Args[0])
		fmt.Fprintf(os.Stderr, " %s -o output.tsv './gsheet.csv'\n", os.Args[0])
	}
	flag.Parse()

	// Check for URL argument
	if len(flag.Args()) != 1 {
		fmt.Fprintf(os.Stderr, "Error: exactly one Google Sheet URL is required\n")
		flag.Usage()
		os.Exit(1)
	}
	url := flag.Args()[0]
	// In the print-only modes nothing is downloaded and nothing is written.
	printOnly := *urlOnly || *parseOnly

	inputDelim, err := gsheet2csv.DecodeDelimiter(*readDelimString)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error decoding input delimiter: %v\n", err)
		os.Exit(1)
	}
	delim, err = gsheet2csv.DecodeDelimiter(*delimString)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error decoding output delimiter: %v\n", err)
		os.Exit(1)
	}

	// Open the input: a download for http(s) URLs, otherwise a local file.
	var rc io.ReadCloser
	if strings.HasPrefix(url, "https://") || strings.HasPrefix(url, "http://") {
		docid, gid := gsheet2csv.ParseIDs(url)
		if *parseOnly {
			fmt.Printf("docid=%s\ngid=%s\n", docid, gid)
		} else {
			fmt.Fprintf(os.Stderr, "docid=%s\ngid=%s\n", docid, gid)
		}
		sheetURL := gsheet2csv.ToCSVURL(docid, gid)
		if *urlOnly {
			fmt.Printf("%s\n", sheetURL)
		}
		if !printOnly {
			// Without a document id the export URL cannot be valid.
			if docid == "" {
				fmt.Fprintf(os.Stderr, "Error: no Google Sheet document id found in %q\n", url)
				os.Exit(1)
			}
			fmt.Fprintf(os.Stderr, "downloading %s\n", sheetURL)
			resp, err := gsheet2csv.GetSheet(docid, gid)
			if err != nil {
				fmt.Fprintf(os.Stderr, "Error getting url: %v\n", err)
				os.Exit(1)
			}
			defer func() { _ = resp.Body.Close() }()
			rc = resp.Body
		}
	} else {
		url = strings.TrimPrefix(url, "file://")
		fmt.Fprintf(os.Stderr, "opening %s\n", url)
		f, err := os.Open(url)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Error opening file: %v\n", err)
			os.Exit(1)
		}
		// Read-only handle: a close error carries no information worth reporting.
		defer func() { _ = f.Close() }()
		rc = f
	}
	if *outputFile == "" {
		fmt.Fprintf(os.Stderr, "\n")
	}
	if printOnly {
		return
	}

	// Prepare output writer. This happens only after the input is open so that
	// a failed download or a print-only run never truncates an existing file.
	out := os.Stdout
	if *outputFile != "" {
		f, err := os.Create(*outputFile)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Error creating output file: %v\n", err)
			os.Exit(1)
		}
		out = f
		defer func() { _ = out.Close() }()
	}

	if *rawOnly {
		if _, err := io.Copy(out, rc); err != nil {
			fmt.Fprintf(os.Stderr, "Error getting url body: %v\n", err)
			os.Exit(1)
		}
		return
	}

	// "0" (or an empty value) disables comment handling; otherwise the first
	// rune of the argument is the comment character.
	var comment rune
	if commentArg != "0" && commentArg != "" {
		comment, _ = utf8.DecodeRuneInString(commentArg)
	}

	// Create a reader for the Google Sheet
	var csvr CSVReader
	if *readStyle == "rfc" {
		rfcr := csv.NewReader(rc)
		rfcr.Comma = inputDelim
		rfcr.Comment = comment
		rfcr.FieldsPerRecord = -1 // Google Sheets is consistent, but our commented files are not
		csvr = rfcr
	} else {
		gsr := gsheet2csv.NewReader(rc)
		gsr.Comma = inputDelim
		// Comments are passed through as single-field records unless stripping was requested.
		if *noReadComments {
			gsr.Comment = comment
		} else {
			gsr.Comment = 0
		}
		gsr.ReuseRecord = true
		csvr = gsr
	}

	// Create CSV writer
	var csvw CSVWriter
	// if *writeStyle == "gsheet"
	{
		gsw := gsheet2csv.NewWriter(out)
		gsw.QuoteAmbiguousComments = *writeStyle == "gsheet"
		gsw.Comment = comment
		gsw.Comma = delim // Set delimiter to tab for TSV
		gsw.UseCRLF = *useCRLF
		csvw = gsw
	}
	// else {
	// rfcw := csv.NewWriter(out)
	// rfcw.Comma = delim
	// rfcw.UseCRLF = *useCRLF
	// csvw = rfcw
	// }

	for {
		// Convert each record
		record, err := csvr.Read()
		if err != nil {
			if errors.Is(err, io.EOF) {
				break
			}
			fmt.Fprintf(os.Stderr, "Error reading %s: %v\n", format, err)
			os.Exit(1)
		}
		if err := csvw.Write(record); err != nil {
			fmt.Fprintf(os.Stderr, "Error writing %s: %v\n", format, err)
			os.Exit(1)
		}
	}
	csvw.Flush()
	if err := csvw.Error(); err != nil {
		fmt.Fprintf(os.Stderr, "Error flushing %s writer: %v\n", format, err)
		os.Exit(1)
	}
	if out != os.Stdout {
		fmt.Fprintf(os.Stderr, "wrote %s\n", *outputFile)
	}
}

View File

@ -0,0 +1,220 @@
package main
import (
"encoding/csv"
"errors"
"flag"
"fmt"
"io"
"os"
"strings"
"unicode/utf8"
"github.com/therootcompany/golib/io/transform/gsheet2csv"
)
// CSVReader is the read-side behavior main needs from a CSV reader. Both
// *csv.Reader and *gsheet2csv.Reader are assigned to it below, so the
// conversion loop does not care which --read-style was chosen.
type CSVReader interface {
	// Read returns the next record, or an error (io.EOF at end of input).
	Read() ([]string, error)
	// ReadAll returns all remaining records.
	ReadAll() ([][]string, error)
}
// CSVWriter is the write-side behavior main needs from a CSV writer.
// It mirrors the method set of encoding/csv's Writer, which
// gsheet2csv.Writer embeds.
type CSVWriter interface {
	// Write writes a single record.
	Write([]string) error
	// WriteAll writes multiple records.
	WriteAll([][]string) error
	// Flush pushes buffered data to the underlying writer.
	Flush()
	// Error reports an error from a previous Write or Flush.
	Error() error
}
// main converts a Google Sheet (given by URL) or a local CSV file (given by
// path or file:// URL) to CSV, or to TSV when the program name contains "tsv".
//
// Progress messages go to stderr; records go to the -o file or stdout.
// With --print-ids or --print-url nothing is downloaded and no output file is
// created; with --raw the input is copied through unparsed.
func main() {
	var commentArg string
	format := "CSV"
	delim := ','
	// The same program acts as the TSV tool when invoked under a "tsv" name.
	if strings.Contains(os.Args[0], "tsv") {
		delim = '\t'
		format = "TSV"
	}

	// Parse command-line flags
	flag.StringVar(&commentArg, "comment", "#", "treat lines beginning with this rune as comments, 0 to disable (which may cause read errors)")
	outputFile := flag.String("o", "", "Output "+format+" file (default: stdout)")
	readDelimString := flag.String("read-delimiter", ",", "field delimiter to use for input file ('\\t' for tab, '^_' for Unit Separator, etc)")
	delimString := flag.String("d", string(delim), "field delimiter to use for output file ('\\t' for tab, '^_' for Unit Separator, etc)")
	useCRLF := flag.Bool("crlf", false, "use CRLF (\\r\\n) as record separator")
	urlOnly := flag.Bool("print-url", false, "don't download, just print the Google Sheet URL")
	parseOnly := flag.Bool("print-ids", false, "don't download, just print the Doc ID and Sheet ID (gid)")
	rawOnly := flag.Bool("raw", false, "don't parse, just download")
	noReadComments := flag.Bool("strip-comments", false, "strip comments when reading (gsheet-only, control rfc behavior with --comment)")
	readStyle := flag.String("read-style", "gsheet", "'gsheet' or 'rfc' to read either as a gsheet or rfc CSV")
	writeStyle := flag.String("write-style", "rfc", "'gsheet' or 'rfc' to write either for gsheet import or rfc CSV read")
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: %s [flags] <google-sheet-url-or-file-path>\n", os.Args[0])
		fmt.Fprintf(os.Stderr, "Converts a Google Sheet to %s format.\n\n", format)
		fmt.Fprintf(os.Stderr, "Flags:\n")
		flag.PrintDefaults()
		fmt.Fprintf(os.Stderr, "\nExample:\n")
		fmt.Fprintf(os.Stderr, " %s -o output.tsv 'https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit?gid=559037238#gid=559037238'\n", os.Args[0])
		fmt.Fprintf(os.Stderr, " %s -o output.tsv 'file://gsheet.csv'\n", os.Args[0])
		fmt.Fprintf(os.Stderr, " %s -o output.tsv './gsheet.csv'\n", os.Args[0])
	}
	flag.Parse()

	// Check for URL argument
	if len(flag.Args()) != 1 {
		fmt.Fprintf(os.Stderr, "Error: exactly one Google Sheet URL is required\n")
		flag.Usage()
		os.Exit(1)
	}
	url := flag.Args()[0]
	// In the print-only modes nothing is downloaded and nothing is written.
	printOnly := *urlOnly || *parseOnly

	inputDelim, err := gsheet2csv.DecodeDelimiter(*readDelimString)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error decoding input delimiter: %v\n", err)
		os.Exit(1)
	}
	delim, err = gsheet2csv.DecodeDelimiter(*delimString)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error decoding output delimiter: %v\n", err)
		os.Exit(1)
	}

	// Open the input: a download for http(s) URLs, otherwise a local file.
	var rc io.ReadCloser
	if strings.HasPrefix(url, "https://") || strings.HasPrefix(url, "http://") {
		docid, gid := gsheet2csv.ParseIDs(url)
		if *parseOnly {
			fmt.Printf("docid=%s\ngid=%s\n", docid, gid)
		} else {
			fmt.Fprintf(os.Stderr, "docid=%s\ngid=%s\n", docid, gid)
		}
		sheetURL := gsheet2csv.ToCSVURL(docid, gid)
		if *urlOnly {
			fmt.Printf("%s\n", sheetURL)
		}
		if !printOnly {
			// Without a document id the export URL cannot be valid.
			if docid == "" {
				fmt.Fprintf(os.Stderr, "Error: no Google Sheet document id found in %q\n", url)
				os.Exit(1)
			}
			fmt.Fprintf(os.Stderr, "downloading %s\n", sheetURL)
			resp, err := gsheet2csv.GetSheet(docid, gid)
			if err != nil {
				fmt.Fprintf(os.Stderr, "Error getting url: %v\n", err)
				os.Exit(1)
			}
			defer func() { _ = resp.Body.Close() }()
			rc = resp.Body
		}
	} else {
		url = strings.TrimPrefix(url, "file://")
		fmt.Fprintf(os.Stderr, "opening %s\n", url)
		f, err := os.Open(url)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Error opening file: %v\n", err)
			os.Exit(1)
		}
		// Read-only handle: a close error carries no information worth reporting.
		defer func() { _ = f.Close() }()
		rc = f
	}
	if *outputFile == "" {
		fmt.Fprintf(os.Stderr, "\n")
	}
	if printOnly {
		return
	}

	// Prepare output writer. This happens only after the input is open so that
	// a failed download or a print-only run never truncates an existing file.
	out := os.Stdout
	if *outputFile != "" {
		f, err := os.Create(*outputFile)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Error creating output file: %v\n", err)
			os.Exit(1)
		}
		out = f
		defer func() { _ = out.Close() }()
	}

	if *rawOnly {
		if _, err := io.Copy(out, rc); err != nil {
			fmt.Fprintf(os.Stderr, "Error getting url body: %v\n", err)
			os.Exit(1)
		}
		return
	}

	// "0" (or an empty value) disables comment handling; otherwise the first
	// rune of the argument is the comment character.
	var comment rune
	if commentArg != "0" && commentArg != "" {
		comment, _ = utf8.DecodeRuneInString(commentArg)
	}

	// Create a reader for the Google Sheet
	var csvr CSVReader
	if *readStyle == "rfc" {
		rfcr := csv.NewReader(rc)
		rfcr.Comma = inputDelim
		rfcr.Comment = comment
		rfcr.FieldsPerRecord = -1 // Google Sheets is consistent, but our commented files are not
		csvr = rfcr
	} else {
		gsr := gsheet2csv.NewReader(rc)
		gsr.Comma = inputDelim
		// Comments are passed through as single-field records unless stripping was requested.
		if *noReadComments {
			gsr.Comment = comment
		} else {
			gsr.Comment = 0
		}
		gsr.ReuseRecord = true
		csvr = gsr
	}

	// Create CSV writer
	var csvw CSVWriter
	// if *writeStyle == "gsheet"
	{
		gsw := gsheet2csv.NewWriter(out)
		gsw.QuoteAmbiguousComments = *writeStyle == "gsheet"
		gsw.Comment = comment
		gsw.Comma = delim // Set delimiter to tab for TSV
		gsw.UseCRLF = *useCRLF
		csvw = gsw
	}
	// else {
	// rfcw := csv.NewWriter(out)
	// rfcw.Comma = delim
	// rfcw.UseCRLF = *useCRLF
	// csvw = rfcw
	// }

	for {
		// Convert each record
		record, err := csvr.Read()
		if err != nil {
			if errors.Is(err, io.EOF) {
				break
			}
			fmt.Fprintf(os.Stderr, "Error reading %s: %v\n", format, err)
			os.Exit(1)
		}
		if err := csvw.Write(record); err != nil {
			fmt.Fprintf(os.Stderr, "Error writing %s: %v\n", format, err)
			os.Exit(1)
		}
	}
	csvw.Flush()
	if err := csvw.Error(); err != nil {
		fmt.Fprintf(os.Stderr, "Error flushing %s writer: %v\n", format, err)
		os.Exit(1)
	}
	if out != os.Stdout {
		fmt.Fprintf(os.Stderr, "wrote %s\n", *outputFile)
	}
}

View File

@ -0,0 +1,33 @@
package main
import (
"fmt"
"os"
"github.com/therootcompany/golib/io/transform/gsheet2csv"
)
// main reads the Google Sheet URL or CSV file path given as the single
// command-line argument and writes its records to stdout as CSV.
func main() {
	// Exactly one argument is accepted; extra arguments used to be silently ignored.
	if len(os.Args) != 2 {
		fmt.Fprintf(os.Stderr, "Usage: %s <url>\n", os.Args[0])
		os.Exit(1)
	}
	urlOrPath := os.Args[1]

	gsr := gsheet2csv.NewReaderFrom(urlOrPath)
	records, err := gsr.ReadAll()
	if err != nil {
		// Report what the user typed: gsr.URL may be unset when the fetch itself failed.
		fmt.Fprintf(os.Stderr, "Error reading from %s: %v\n", urlOrPath, err)
		os.Exit(1)
	}

	csvw := gsheet2csv.NewWriter(os.Stdout)
	csvw.Comment = gsr.Comment
	if err := csvw.WriteAll(records); err != nil {
		fmt.Fprintf(os.Stderr, "Error writing csv %v\n", err)
		os.Exit(1)
	}
}

View File

@ -0,0 +1,18 @@
# this is a comment,,
"# this is, well, a quoted comment",,
"# this is a ""super""-quoted comment",,
Key,Value,
Name,55,
Girlfriend's Age,55,
,,
My IQ,55,
,55,
"Key,with,Comma",,
,"Value,with,Comma",
"Quoted ""Key""",Normal Value,
Normal Key,"Quoted ""Value""",
"Quoted ""Key""",,
,"Quoted ""Value""",
x,y,z
"# comment with trailing comma,",,
#1,2,#3
1 # this is a comment
2 # this is, well, a quoted comment
3 # this is a "super"-quoted comment
4 Key Value
5 Name 55
6 Girlfriend's Age 55
7
8 My IQ 55
9 55
10 Key,with,Comma
11 Value,with,Comma
12 Quoted "Key" Normal Value
13 Normal Key Quoted "Value"
14 Quoted "Key"
15 Quoted "Value"
16 x y z
17 # comment with trailing comma,
18 #1 2 #3

View File

@ -0,0 +1,14 @@
Key,Value,
Name,55,
Girlfriend's Age,55,
,,
My IQ,55,
,55,
"Key,with,Comma",,
,"Value,with,Comma",
"Quoted ""Key""",Normal Value,
Normal Key,"Quoted ""Value""",
"Quoted ""Key""",,
,"Quoted ""Value""",
x,y,z
"#1",2,#3
1 Key Value
2 Name 55
3 Girlfriend's Age 55
4
5 My IQ 55
6 55
7 Key,with,Comma
8 Value,with,Comma
9 Quoted "Key" Normal Value
10 Normal Key Quoted "Value"
11 Quoted "Key"
12 Quoted "Value"
13 x y z
14 #1 2 #3

View File

@ -0,0 +1,18 @@
# this is a comment
"# this is, well, a quoted comment"
"# this is a ""super""-quoted comment"
Key,Value,
Name,55,
Girlfriend's Age,55,
,,
My IQ,55,
,55,
"Key,with,Comma",,
,"Value,with,Comma",
"Quoted ""Key""",Normal Value,
Normal Key,"Quoted ""Value""",
"Quoted ""Key""",,
,"Quoted ""Value""",
x,y,z
"# comment with trailing comma,"
"#1",2,#3
Can't render this file because it contains an unexpected character in line 10 and column 16.

View File

@ -0,0 +1,18 @@
# this is a comment
# this is, well, a quoted comment
# this is a "super"-quoted comment
Key,Value,
Name,55,
Girlfriend's Age,55,
,,
My IQ,55,
,55,
"Key,with,Comma",,
,"Value,with,Comma",
"Quoted ""Key""",Normal Value,
Normal Key,"Quoted ""Value""",
"Quoted ""Key""",,
,"Quoted ""Value""",
x,y,z
# comment with trailing comma,
"#1",2,#3
Can't render this file because it contains an unexpected character in line 3 and column 13.

View File

@ -0,0 +1,18 @@
# this is a comment
# this is, well, a quoted comment
# this is a "super"-quoted comment
Key Value
Name 55
Girlfriend's Age 55
My IQ 55
55
Key,with,Comma
Value,with,Comma
"Quoted ""Key""" Normal Value
Normal Key "Quoted ""Value"""
"Quoted ""Key"""
"Quoted ""Value"""
x y z
# comment with trailing comma,
"#1" 2 #3
Can't render this file because it contains an unexpected character in line 3 and column 13.

View File

@ -0,0 +1,3 @@
module github.com/therootcompany/golib/io/transform/gsheet2csv
go 1.24.6

View File

@ -0,0 +1,309 @@
// Authored in 2025 by AJ ONeal <aj@therootcompany.com> (https://therootcompany.com)
//
// To the extent possible under law, the author(s) have dedicated all copyright
// and related and neighboring rights to this software to the public domain
// worldwide. This software is distributed without any warranty.
//
// You should have received a copy of the CC0 Public Domain Dedication along with
// this software. If not, see <https://creativecommons.org/publicdomain/zero/1.0/>.
//
// SPDX-License-Identifier: CC0-1.0
package gsheet2csv
import (
"encoding/csv"
"errors"
"fmt"
"io"
"net/http"
"os"
"strings"
"unicode/utf8"
)
// ASCII control characters that DecodeDelimiter accepts by name
// (caret notation such as "^_" or an escaped form such as "\x1f").
const (
	fileSeparator   = '\x1c'
	groupSeparator  = '\x1d'
	recordSeparator = '\x1e'
	unitSeparator   = '\x1f'
)

// ErrHTTPGet is returned by GetSheet when the response status is not 200 OK.
var ErrHTTPGet = errors.New("did not get 200 OK when downloading from URL")

// httpGet performs the download in GetSheet; it is a variable so tests can
// substitute a mock for http.Get.
var httpGet = http.Get
// Reader wraps an embedded *csv.Reader and adds Google Sheet metadata plus
// its own comment handling (see Read).
type Reader struct {
	*csv.Reader
	// DocID and GID are the document and sheet ids; set by NewReaderFromIDs.
	DocID string
	GID   string
	// URL is the CSV export URL (NewReaderFromIDs) or the file path (NewReaderFrom).
	URL string
	// Comment is the rune that marks a single-field record as a comment to be
	// skipped by Read; 0 disables skipping. It shadows the embedded
	// csv.Reader.Comment, which NewReader sets to 0.
	Comment rune
	r       io.Reader      // the underlying source passed to NewReader
	resp    *http.Response // the download response, when reading from a URL
	close   bool           // whether Read closes resp.Body once reading fails or ends
	err     error          // deferred construction error, returned by Read
}
// NewReaderFrom returns a Reader for either a Google Sheet URL (anything
// starting with "http://" or "https://") or a local file path, which may be
// given with a "file://" prefix.
//
// It never returns nil. A failure to open the source is stored on the Reader
// and reported by the first call to Read.
func NewReaderFrom(urlOrPath string) *Reader {
	if strings.HasPrefix(urlOrPath, "https://") || strings.HasPrefix(urlOrPath, "http://") {
		return NewReaderFromURL(urlOrPath)
	}

	urlOrPath = strings.TrimPrefix(urlOrPath, "file://")
	f, err := os.Open(urlOrPath)
	if err != nil {
		// Pass an untyped nil rather than wrapping a nil *os.File in the io.Reader.
		r := NewReader(nil)
		r.URL = urlOrPath
		r.err = err
		return r
	}

	r := NewReader(f)
	r.URL = urlOrPath
	// Read only knows how to close r.resp.Body, so hand it the file through a
	// bare response; otherwise the descriptor is never released.
	r.resp = &http.Response{Body: f}
	r.close = true
	return r
}
// NewReaderFromURL extracts the document and sheet ids from a Google Sheet
// URL and returns a Reader for that sheet's CSV export.
func NewReaderFromURL(url string) *Reader {
	return NewReaderFromIDs(ParseIDs(url))
}
// NewReaderFromIDs downloads the CSV export of the sheet identified by docid
// and gid and returns a Reader over the response body.
//
// It never returns nil. If the download fails, the error is stored on the
// Reader and reported by the first call to Read. URL, DocID and GID are set
// in both cases so callers can mention them in error messages.
func NewReaderFromIDs(docid, gid string) *Reader {
	resp, err := GetSheet(docid, gid)

	var body io.Reader
	if err == nil {
		body = resp.Body
	}
	r := NewReader(body)
	r.URL = ToCSVURL(docid, gid)
	r.DocID = docid
	r.GID = gid
	if err != nil {
		r.err = err
		return r
	}

	r.resp = resp
	r.close = true // Read closes the body once reading fails or reaches EOF
	return r
}
// ToCSVURL builds the Google Sheets CSV export URL for the given document id
// and sheet id (gid). The ids are inserted as-is, without escaping.
func ToCSVURL(docid, gid string) string {
	const exportFormat = "https://docs.google.com/spreadsheets/d/%s/export?format=csv&usp=sharing&gid=%s"

	csvURL := fmt.Sprintf(exportFormat, docid, gid)
	return csvURL
}
// GetSheet downloads the CSV export of the sheet identified by docid and gid.
//
// On success the caller owns the response and must close its Body. Any status
// other than 200 OK closes the body and returns an error that wraps
// ErrHTTPGet (match it with errors.Is) and names the status and URL.
func GetSheet(docid, gid string) (*http.Response, error) {
	downloadURL := ToCSVURL(docid, gid)

	resp, err := httpGet(downloadURL)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode != http.StatusOK {
		// The body is discarded, so a close error has nothing useful to add.
		_ = resp.Body.Close()
		return nil, fmt.Errorf("%w: got %q from %s", ErrHTTPGet, resp.Status, downloadURL)
	}
	return resp, nil
}
// NewReader returns a Reader over r configured for Google Sheet CSV exports:
// comma-delimited, strict quoting, variable field counts, and '#' as the
// comment rune handled by Reader.Read.
func NewReader(r io.Reader) *Reader {
	gsr := &Reader{
		Reader:  csv.NewReader(r),
		Comment: '#',
		r:       r,
	}

	inner := gsr.Reader
	inner.Comma = ','
	inner.Comment = 0          // to allow distinguishing between quoted comments and fields
	inner.FieldsPerRecord = -1 // Google Sheets is consistent, but our commented files are not
	inner.LazyQuotes = false   // fields that need quotes use them correctly
	inner.TrimLeadingSpace = false
	inner.ReuseRecord = false

	return gsr
}
// DecodeDelimiter converts a command-line delimiter description into a rune.
//
// It accepts caret notation and escaped forms for the ASCII separators
// ("^_" / "\x1f", "^^" / "\x1e", "^]" / "\x1d", "^\" / "\x1c"), form feed
// ("^L" / "\f"), vertical tab ("^K" / "\v") and tab ("^I" / "\t"). Any other
// input must be exactly one valid UTF-8 character, which is returned as-is.
//
// An error is returned for an empty string, invalid UTF-8, or more than one
// character; previously these silently decoded to a wrong delimiter.
func DecodeDelimiter(delimString string) (rune, error) {
	switch delimString {
	case "^_", "\\x1f":
		return unitSeparator, nil
	case "^^", "\\x1e":
		return recordSeparator, nil
	case "^]", "\\x1d":
		return groupSeparator, nil
	case "^\\", "\\x1c":
		return fileSeparator, nil
	case "^L", "\\f":
		return '\f', nil
	case "^K", "\\v":
		return '\v', nil
	case "^I", "\\t":
		// Spelled as an escape so the tab cannot be mistaken for (or turned into) a space.
		return '\t', nil
	}

	delim, size := utf8.DecodeRuneInString(delimString)
	switch {
	case size == 0:
		return 0, fmt.Errorf("delimiter must not be empty")
	case delim == utf8.RuneError && size == 1:
		return 0, fmt.Errorf("delimiter %q is not valid UTF-8", delimString)
	case size != len(delimString):
		return 0, fmt.Errorf("delimiter %q must be a single character", delimString)
	}
	return delim, nil
}
// Read returns the next record, skipping comment lines when r.Comment is set.
//
// A record is a comment when its first field begins with r.Comment and every
// other field is empty (a sheet export pads comments with empty cells).
// Records with data after a comment-like first field are returned unchanged.
//
// A deferred construction error is returned first, if any. On any read error,
// including io.EOF, the response body is closed when the Reader owns it.
func (r *Reader) Read() ([]string, error) {
	if r.err != nil {
		return nil, r.err
	}

	for {
		record, err := r.Reader.Read()
		if err != nil {
			if r.close {
				// Reading is over either way; a close error has nothing to add.
				_ = r.resp.Body.Close()
			}
			return nil, err
		}

		// An empty first field cannot begin with the comment rune. Checking it
		// up front also keeps utf8.RuneError (what an empty string decodes to)
		// from matching a Comment that is itself utf8.RuneError.
		if r.Comment > 0 && len(record) > 0 && record[0] != "" {
			if rv, _ := utf8.DecodeRuneInString(record[0]); rv == r.Comment {
				// Find the last non-empty field; the scan stops at index 0
				// so it can never run off the front of the record.
				last := len(record) - 1
				for last > 0 && record[last] == "" {
					last--
				}
				if last == 0 {
					continue
				}
			}
		}
		return record, nil
	}
}
// ReadAll reads records with Read until it fails. Reaching io.EOF is not
// reported as an error; any other error is returned together with the records
// read so far.
func (r *Reader) ReadAll() ([][]string, error) {
	var all [][]string

	row, err := r.Read()
	for ; err == nil; row, err = r.Read() {
		all = append(all, row)
	}

	if errors.Is(err, io.EOF) {
		return all, nil
	}
	return all, err
}
// ParseIDs extracts the document id and sheet id (gid) from a Google Sheets URL.
//
// The document id is whatever follows "/spreadsheets/d/" up to the next '/',
// '?' or '#' (or the end of the string), so URLs without a trailing path
// segment such as ".../d/{docid}?gid=1" work too. The gid is taken from the
// first "gid=" in the URL, up to the next '#', '&', '?' or '/', and defaults
// to "0" when absent or empty.
//
// Both results are empty when no document id can be found.
func ParseIDs(urlStr string) (docid string, gid string) {
	// Find key: look for /spreadsheets/d/{key}
	const prefix = "/spreadsheets/d/"
	_, rest, found := strings.Cut(urlStr, prefix)
	if !found {
		return "", ""
	}

	// The key ends at the next path, query or fragment delimiter.
	docid = rest
	if end := strings.IndexAny(rest, "/?#"); end != -1 {
		docid = rest[:end]
	}
	if docid == "" {
		return "", ""
	}

	// Find gid: look for gid= and take until #, &, ?, /, or end
	gid = "0"
	if _, value, ok := strings.Cut(urlStr, "gid="); ok {
		if end := strings.IndexAny(value, "#&?/"); end != -1 {
			value = value[:end]
		}
		if value != "" {
			gid = value
		}
	}
	return docid, gid
}
// Writer wraps an embedded *csv.Writer and adds comment-aware output (see Write).
type Writer struct {
	*csv.Writer
	// Comment is the rune that marks a record as a possible comment;
	// 0 disables the special handling in Write.
	Comment rune
	// QuoteAmbiguousComments makes Write quote a single-field comment that
	// contains quotes or the delimiter instead of writing it out plain.
	QuoteAmbiguousComments bool
	w                      io.Writer // the underlying writer, used by Write for lines it formats itself
}
// NewWriter returns a Writer that writes to w and treats '#' as the comment
// rune. QuoteAmbiguousComments is left off.
func NewWriter(w io.Writer) *Writer {
	gsw := new(Writer)
	gsw.Writer = csv.NewWriter(w)
	gsw.Comment = '#'
	gsw.w = w
	return gsw
}
// Write writes one record, giving special treatment to records whose first
// field begins with w.Comment. All other records (and every record when
// w.Comment is 0) go straight to the embedded csv.Writer.
//
//   - A true comment (only the first field is non-empty) is written as a
//     single field: plain by default, or quoted only if it needs quoting
//     when QuoteAmbiguousComments is set.
//   - Data that merely begins with the comment rune (other fields are
//     non-empty) is written with its first field always quoted, so a reader
//     does not mistake the line for a comment.
//
// A field "needs quoting" when it contains a quote, the delimiter, or a line
// break. The caller's record is never modified.
func (w *Writer) Write(record []string) error {
	// Not handling comments? Move along.
	if w.Comment == 0 || len(record) == 0 {
		return w.Writer.Write(record)
	}
	// First char not a comment char? Move along. An empty field is checked
	// explicitly because it decodes to utf8.RuneError, which must not match a
	// Comment that is itself utf8.RuneError.
	if record[0] == "" {
		return w.Writer.Write(record)
	}
	if rv1, _ := utf8.DecodeRuneInString(record[0]); rv1 != w.Comment {
		return w.Writer.Write(record)
	}

	// Is this a true comment? Or data that should be quoted that begins with
	// the comment char? The scan stops at index 0 and cannot go negative.
	lastNonEmpty := len(record) - 1
	for lastNonEmpty > 0 && record[lastNonEmpty] == "" {
		lastNonEmpty--
	}

	// We will be doing custom writes ahead, so push out anything buffered
	// first to keep the output in order.
	w.Flush()
	if err := w.Error(); err != nil {
		return err
	}
	newline := "\n"
	if w.UseCRLF {
		newline = "\r\n"
	}

	// Write true comments out plain
	fields := record
	quoteFirst := true
	if lastNonEmpty == 0 {
		if !w.QuoteAmbiguousComments {
			_, err := w.w.Write([]byte(record[0] + newline))
			return err
		}
		// Quote the comment only if it needs it, not universally
		fields = record[:1]
		quoteFirst = false
	}

	// Quote into a fresh slice so the caller's record is left untouched.
	comma := string(w.Comma)
	out := make([]string, len(fields))
	for i, f := range fields {
		if (quoteFirst && i == 0) || strings.ContainsAny(f, "\"\r\n") || strings.Contains(f, comma) {
			f = `"` + strings.ReplaceAll(f, `"`, `""`) + `"`
		}
		out[i] = f
	}

	_, err := w.w.Write([]byte(strings.Join(out, comma) + newline))
	return err
}
// WriteAll writes every record with Write, stopping at the first failure,
// then flushes and returns any error the flush produced.
func (w *Writer) WriteAll(records [][]string) error {
	for i := range records {
		err := w.Write(records[i])
		if err != nil {
			return err
		}
	}

	w.Flush()
	return w.Error()
}

View File

@ -0,0 +1,249 @@
package gsheet2csv
import (
"errors"
"io"
"net/http"
"slices"
"strings"
"testing"
)
// mockHTTPClient allows controlling HTTP responses for testing.
// Its Get method is assigned to the package-level httpGet variable.
type mockHTTPClient struct {
	resp *http.Response // response returned by Get
	err  error          // error returned by Get
}
// Get ignores the URL and returns the canned response and error.
func (m *mockHTTPClient) Get(url string) (*http.Response, error) {
	resp, err := m.resp, m.err
	return resp, err
}
// sampleCSV mimics the structure of ai-models.csv from the project README.
// It holds a plain comment line, a quoted comment line containing a comma and
// escaped quotes, a header row, and three data rows.
const sampleCSV = `# Generated by ollama list
"# Sample Quoted Comment, with ""quotes"" itself"
"NAME","ID","SIZE","MODIFIED"
"qwen3-coder:30b","06c1097efce0","18 GB","8 days ago"
"gpt-oss:20b","aa4295ac10c3","13 GB","8 days ago"
"gpt-oss:latest","aa4295ac10c3","13 GB","7 weeks ago"
`
// malformedCSV for testing error handling: the last field of the header row
// ("MODIFIED) is missing its closing quote.
const malformedCSV = `# Comment
"NAME","ID","SIZE","MODIFIED
"qwen3-coder:30b","06c1097efce0","18 GB","8 days ago"
`
// TestParseIDs checks that ParseIDs pulls the document id and gid out of the
// supported URL shapes, defaults the gid to "0", and returns empty ids for a
// URL that is not a Google Sheets URL.
func TestParseIDs(t *testing.T) {
	type parseCase struct {
		name    string
		url     string
		wantDoc string
		wantGid string
	}

	cases := []parseCase{
		{
			"Google Sheets Edit / Share URL with gid",
			"https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit?gid=559037238#gid=559037238",
			"1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34",
			"559037238",
		},
		{
			"Google Sheets CSV URL with gid",
			"https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/export?format=csv&usp=sharing&gid=559037238",
			"1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34",
			"559037238",
		},
		{
			"URL without gid",
			"https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit",
			"1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34",
			"0",
		},
		{
			"Invalid URL",
			"https://example.com/invalid",
			"",
			"",
		},
	}

	for i := range cases {
		c := cases[i]
		t.Run(c.name, func(t *testing.T) {
			doc, gid := ParseIDs(c.url)
			if doc != c.wantDoc {
				t.Errorf("ParseIDs() docid = %q, want %q", doc, c.wantDoc)
			}
			if gid != c.wantGid {
				t.Errorf("ParseIDs() gid = %q, want %q", gid, c.wantGid)
			}
		})
	}
}
// TestNewReaderFromURL covers the three outcomes of fetching a sheet through
// the mocked httpGet: a 200 response, a transport error, and a non-200 status.
func TestNewReaderFromURL(t *testing.T) {
	savedGet := httpGet
	defer func() { httpGet = savedGet }()

	const sheetURL = "https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit?gid=559037238"

	// fetch installs a mock returning (resp, err) and builds a reader through it.
	fetch := func(resp *http.Response, err error) *Reader {
		mock := &mockHTTPClient{resp: resp, err: err}
		httpGet = mock.Get
		return NewReaderFromURL(sheetURL)
	}

	// Test successful HTTP response
	okResp := &http.Response{
		StatusCode: http.StatusOK,
		Body:       io.NopCloser(strings.NewReader(sampleCSV)),
	}
	got := fetch(okResp, nil)
	if got.err != nil {
		t.Errorf("NewReaderFromURL() unexpected error: %v", got.err)
	}
	if got.resp != okResp {
		t.Error("NewReaderFromURL() did not set response correctly")
	}
	if !got.close {
		t.Error("NewReaderFromURL() did not set close flag")
	}

	// Test HTTP failure
	got = fetch(okResp, errors.New("network error"))
	if got.err == nil {
		t.Error("NewReaderFromURL() expected error, got nil")
	}

	// Test non-200 status
	notFound := &http.Response{
		StatusCode: http.StatusNotFound,
		Body:       io.NopCloser(strings.NewReader("these aren't the droids you're looking for")),
	}
	got = fetch(notFound, nil)
	if got.err == nil {
		t.Error("NewReaderFromURL() expected error for non-200 status, got nil")
	}
}
// TestRead reads sampleCSV record by record, once with the default comment
// rune (comment lines skipped) and once with Comment set to 0 (comment lines
// returned as single-field records), then expects io.EOF.
func TestRead(t *testing.T) {
	dataRows := [][]string{
		{"NAME", "ID", "SIZE", "MODIFIED"},
		{"qwen3-coder:30b", "06c1097efce0", "18 GB", "8 days ago"},
		{"gpt-oss:20b", "aa4295ac10c3", "13 GB", "8 days ago"},
		{"gpt-oss:latest", "aa4295ac10c3", "13 GB", "7 weeks ago"},
	}
	commentRows := [][]string{
		{"# Generated by ollama list"},
		{"# Sample Quoted Comment, with \"quotes\" itself"},
	}

	cases := []struct {
		name             string
		preserveComments bool
		expected         [][]string
	}{
		{name: "Skip comments", expected: dataRows},
		{
			name:             "Don't skip comments",
			preserveComments: true,
			expected:         slices.Concat(commentRows, dataRows),
		},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			gsr := NewReader(strings.NewReader(sampleCSV))
			if c.preserveComments {
				gsr.Comment = 0
			}

			for n := 0; n < len(c.expected); n++ {
				record, err := gsr.Read()
				if err != nil {
					t.Errorf("Read() error at record %d: %v", n, err)
				}
				if !slices.Equal(record, c.expected[n]) {
					t.Errorf("Read() record %d = %v, want %v", n, record, c.expected[n])
				}
			}

			// Verify EOF
			if _, err := gsr.Read(); !errors.Is(err, io.EOF) {
				t.Errorf("Read() expected EOF, got %v", err)
			}
		})
	}
}
// TestReadAll reads all of sampleCSV with the default Reader settings and
// expects the comment lines to be skipped.
func TestReadAll(t *testing.T) {
	tests := []struct {
		name     string
		expected [][]string
	}{
		{
			name: "Skip comments",
			expected: [][]string{
				{"NAME", "ID", "SIZE", "MODIFIED"},
				{"qwen3-coder:30b", "06c1097efce0", "18 GB", "8 days ago"},
				{"gpt-oss:20b", "aa4295ac10c3", "13 GB", "8 days ago"},
				{"gpt-oss:latest", "aa4295ac10c3", "13 GB", "7 weeks ago"},
			},
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			reader := NewReader(strings.NewReader(sampleCSV))
			got, err := reader.ReadAll()
			if err != nil {
				t.Errorf("ReadAll() error: %v", err)
			}
			// Fatal: indexing got[i] below would panic if got is shorter than expected.
			if len(got) != len(tt.expected) {
				t.Fatalf("ReadAll() returned %d records, want %d", len(got), len(tt.expected))
			}
			for i, want := range tt.expected {
				if !slices.Equal(got[i], want) {
					t.Errorf("ReadAll() record %d = %v, want %v", i, got[i], want)
				}
			}
		})
	}
}
// TestNewReaderFromURLWithMalformedCSV tests NewReaderFromURL with malformed CSV.
func TestNewReaderFromURLWithMalformedCSV(t *testing.T) {
mockResp := &http.Response{
StatusCode: http.StatusOK,
Body: io.NopCloser(strings.NewReader(malformedCSV)),
}
client := &mockHTTPClient{resp: mockResp}
originalGet := httpGet
httpGet = client.Get
defer func() { httpGet = originalGet }()
url := "https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit?gid=559037238"
reader := NewReaderFromURL(url)
if reader.err != nil {
t.Errorf("NewReaderFromURL() unexpected error: %v", reader.err)
}
// Reading should fail due to malformed CSV
_, err := reader.Read()
if err == nil {
t.Error("Read() expected error for malformed CSV, got nil")
}
}