mirror of
https://github.com/therootcompany/golib.git
synced 2025-10-12 20:18:16 +00:00
feat(gsheet2csv): parse URLs and CSVs with comments
This commit is contained in:
parent
dc951ce388
commit
cd00d85968
7
io/transforms/gsheet2csv/LICENSE
Normal file
7
io/transforms/gsheet2csv/LICENSE
Normal file
@ -0,0 +1,7 @@
|
||||
Authored in 2025 by AJ ONeal <aj@therootcompany.com>
|
||||
To the extent possible under law, the author(s) have dedicated all copyright
|
||||
and related and neighboring rights to this software to the public domain
|
||||
worldwide. This software is distributed without any warranty.
|
||||
|
||||
You should have received a copy of the CC0 Public Domain Dedication along with
|
||||
this software. If not, see <https://creativecommons.org/publicdomain/zero/1.0/>.
|
101
io/transforms/gsheet2csv/README.md
Normal file
101
io/transforms/gsheet2csv/README.md
Normal file
@ -0,0 +1,101 @@
|
||||
# gsheet2csv
|
||||
|
||||
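[![Go Reference](https://pkg.go.dev/badge/github.com/therootcompany/golib/io/transform/gsheet2csv.svg)](https://pkg.go.dev/github.com/therootcompany/golib/io/transform/gsheet2csv)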
|
||||
|
||||
A simple wrapper around `encoding/csv` to read Google Sheet CSVs from URL, or a given Reader.
|
||||
|
||||
This does surprisingly little - you should probably just handle the boilerplate yourself. However, these are the problems it solves for us:
|
||||
|
||||
- works with Google Sheet URLs, regardless of URL format
|
||||
- Edit URL: <https://docs.google.com/spreadsheets/d/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX/edit?gid=0000000000#gid=0000000000>
|
||||
- Share URL (Sheet 1): <https://docs.google.com/spreadsheets/d/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX/edit?usp=sharing>
|
||||
- CSV Export URL: <https://docs.google.com/spreadsheets/d/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX/export?format=csv&usp=sharing&gid=0000000000>
|
||||
- anything with a path like `/spreadsheets/d/{docid}/` and (optionally) a hash or query param like `gid={gid}`
|
||||
- can ignore quoted comments (if all other fields in the row are empty)
|
||||
- can preserve comments
|
||||
- swaps `\r\n` (Windows) for `\n` (Unix) and ensures trailing newline (a la `encoding/csv`)
|
||||
|
||||
## Usage
|
||||
|
||||
Same as `encoding/csv` (embedded), but with two extra options:
|
||||
|
||||
```go
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/therootcompany/golib/io/transform/gsheet2csv"
|
||||
)
|
||||
|
||||
func main() {
|
||||
switch len(os.Args) {
|
||||
case 2:
|
||||
break
|
||||
case 1:
|
||||
fmt.Fprintf(os.Stderr, "Usage: %s <url>\n", os.Args[0])
|
||||
os.Exit(1)
|
||||
}
|
||||
url := os.Args[1]
|
||||
|
||||
gsr := gsheet2csv.NewReaderFromURL(url)
|
||||
records, err := gsr.ReadAll()
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error reading from %s\n", gsr.URL)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// distinguishes between comments and quoted fields
|
||||
csvw := gsheet2csv.NewWriter(os.Stdout)
|
||||
csvw.Comment = gsr.Comment
|
||||
if err := csvw.WriteAll(records); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error writing csv %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## CLI
|
||||
|
||||
There are two convenience utilities:
|
||||
|
||||
- `gsheet2csv`
|
||||
- `gsheet2tsv`
|
||||
|
||||
They're only slightly different from a direct export of a Google Sheet CSV in that they reformat comments and newlines.
|
||||
|
||||
### Flags & Options
|
||||
|
||||
```text
|
||||
--raw download without processing
|
||||
--print-ids print ids to stdout without downloading
|
||||
--print-url print url to stdout without downloading
|
||||
-o <filepath> write records to file
|
||||
-d field delimiter
|
||||
--comment '#' treat lines starting with # as comments
|
||||
--crlf use CRLF (\r\n) as record separator
|
||||
```
|
||||
|
||||
### Installation
|
||||
|
||||
```sh
|
||||
go get github.com/therootcompany/golib/io/transform/gsheet2csv
|
||||
```
|
||||
|
||||
### ASCII Delimiters
|
||||
|
||||
```
|
||||
, comma
|
||||
\t tab (or a normal tab)
|
||||
space (just a normal space)
|
||||
: colon
|
||||
; semicolon
|
||||
| pipe
|
||||
^_ unit separator
|
||||
^^ record separator
|
||||
^] group separator
|
||||
^\ file separator
|
||||
\f form feed (also ^L)
|
||||
\v vertical tab (also ^K)
|
||||
```
|
176
io/transforms/gsheet2csv/cmd/gsheet2csv/main.go
Normal file
176
io/transforms/gsheet2csv/cmd/gsheet2csv/main.go
Normal file
@ -0,0 +1,176 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/therootcompany/golib/io/transform/gsheet2csv"
|
||||
)
|
||||
|
||||
const (
|
||||
fileSeparator = "\x1c"
|
||||
groupSeparator = "\x1d"
|
||||
recordSeparator = "\x1e"
|
||||
unitSeparator = "\x1f"
|
||||
)
|
||||
|
||||
func main() {
|
||||
var commentArg string
|
||||
format := "CSV"
|
||||
delim := ','
|
||||
if strings.Contains(os.Args[0], "tsv") {
|
||||
delim = '\t'
|
||||
format = "TSV"
|
||||
}
|
||||
|
||||
// Parse command-line flags
|
||||
flag.StringVar(&commentArg, "comment", "#", "treat lines beginning with this rune as comments")
|
||||
outputFile := flag.String("o", "", "Output "+format+" file (default: stdout)")
|
||||
delimString := flag.String("d", string(delim), "field delimiter to use for output file ('\\t' for tab, '^_' for Unit Separator, etc)")
|
||||
useCRLF := flag.Bool("crlf", false, "use CRLF (\\r\\n) as record separator")
|
||||
urlOnly := flag.Bool("print-url", false, "don't download, just print the Google Sheet URL")
|
||||
parseOnly := flag.Bool("print-ids", false, "don't download, just print the Doc ID and Sheet ID (gid)")
|
||||
rawOnly := flag.Bool("raw", false, "don't parse, just download")
|
||||
flag.Usage = func() {
|
||||
fmt.Fprintf(os.Stderr, "Usage: %s [flags] <google-sheet-url-or-file-path>\n", os.Args[0])
|
||||
fmt.Fprintf(os.Stderr, "Converts a Google Sheet to %s format.\n\n", format)
|
||||
fmt.Fprintf(os.Stderr, "Flags:\n")
|
||||
flag.PrintDefaults()
|
||||
fmt.Fprintf(os.Stderr, "\nExample:\n")
|
||||
fmt.Fprintf(os.Stderr, " %s -o output.tsv 'https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit?gid=559037238#gid=559037238'\n", os.Args[0])
|
||||
fmt.Fprintf(os.Stderr, " %s -o output.tsv 'file://gsheet.csv'\n", os.Args[0])
|
||||
fmt.Fprintf(os.Stderr, " %s -o output.tsv './gsheet.csv'\n", os.Args[0])
|
||||
}
|
||||
flag.Parse()
|
||||
|
||||
// Check for URL argument
|
||||
if len(flag.Args()) != 1 {
|
||||
fmt.Fprintf(os.Stderr, "Error: exactly one Google Sheet URL is required\n")
|
||||
flag.Usage()
|
||||
os.Exit(1)
|
||||
}
|
||||
url := flag.Args()[0]
|
||||
|
||||
// Prepare output writer
|
||||
var out *os.File
|
||||
if *outputFile != "" {
|
||||
var err error
|
||||
out, err = os.Create(*outputFile)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error creating output file: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer func() { _ = out.Close() }()
|
||||
} else {
|
||||
out = os.Stdout
|
||||
}
|
||||
|
||||
switch *delimString {
|
||||
case "^_", "\\x1f":
|
||||
*delimString = unitSeparator
|
||||
case "^^", "\\x1e":
|
||||
*delimString = recordSeparator
|
||||
case "^]", "\\x1d":
|
||||
*delimString = groupSeparator
|
||||
case "^\\", "\\x1c":
|
||||
*delimString = fileSeparator
|
||||
case "^L", "\\f":
|
||||
*delimString = "\f"
|
||||
case "^K", "\\v":
|
||||
*delimString = "\v"
|
||||
case "^I", "\\t":
|
||||
*delimString = "\t"
|
||||
}
|
||||
delim, _ = utf8.DecodeRuneInString(*delimString)
|
||||
|
||||
var rc io.ReadCloser
|
||||
if strings.HasPrefix(url, "https://") || strings.HasPrefix(url, "http://") {
|
||||
docid, gid := gsheet2csv.ParseIDs(url)
|
||||
if *parseOnly {
|
||||
fmt.Printf("docid=%s\ngid=%s\n", docid, gid)
|
||||
} else {
|
||||
fmt.Fprintf(os.Stderr, "docid=%s\ngid=%s\n", docid, gid)
|
||||
}
|
||||
|
||||
sheetURL := gsheet2csv.ToCSVURL(docid, gid)
|
||||
if *urlOnly {
|
||||
fmt.Printf("%s\n", sheetURL)
|
||||
} else {
|
||||
fmt.Fprintf(os.Stderr, "downloading %s\n", sheetURL)
|
||||
}
|
||||
|
||||
if !*urlOnly {
|
||||
resp, err := gsheet2csv.GetSheet(docid, gid)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error getting url: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
rc = resp.Body
|
||||
}
|
||||
} else {
|
||||
url = strings.TrimPrefix(url, "file://")
|
||||
fmt.Fprintf(os.Stderr, "opening %s\n", url)
|
||||
f, err := os.Open(url)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error opening file: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
rc = f
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "\n")
|
||||
|
||||
if *urlOnly || *parseOnly {
|
||||
os.Exit(0)
|
||||
return
|
||||
}
|
||||
|
||||
if *rawOnly {
|
||||
if _, err := io.Copy(out, rc); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error getting url body: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
comment, _ := utf8.DecodeRuneInString(commentArg)
|
||||
|
||||
// Create a reader for the Google Sheet
|
||||
gsr := gsheet2csv.NewReader(rc)
|
||||
gsr.QuotedComments = false
|
||||
gsr.Comment = 0
|
||||
gsr.ReuseRecord = true
|
||||
|
||||
// Create CSV writer
|
||||
csvw := gsheet2csv.NewWriter(out)
|
||||
csvw.Comma = delim // Set delimiter to tab for TSV
|
||||
csvw.Comment = comment
|
||||
csvw.UseCRLF = *useCRLF
|
||||
for {
|
||||
// Convert each record
|
||||
record, err := gsr.Read()
|
||||
if err != nil {
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "Error reading "+format+": %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if err := csvw.Write(record); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error writing "+format+": %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
// Flush the writer to ensure all data is written
|
||||
csvw.Flush()
|
||||
if err := csvw.Error(); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error flushing "+format+" writer: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
176
io/transforms/gsheet2csv/cmd/gsheet2tsv/main.go
Normal file
176
io/transforms/gsheet2csv/cmd/gsheet2tsv/main.go
Normal file
@ -0,0 +1,176 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/therootcompany/golib/io/transform/gsheet2csv"
|
||||
)
|
||||
|
||||
const (
|
||||
fileSeparator = "\x1c"
|
||||
groupSeparator = "\x1d"
|
||||
recordSeparator = "\x1e"
|
||||
unitSeparator = "\x1f"
|
||||
)
|
||||
|
||||
func main() {
|
||||
var commentArg string
|
||||
format := "CSV"
|
||||
delim := ','
|
||||
if strings.Contains(os.Args[0], "tsv") {
|
||||
delim = '\t'
|
||||
format = "TSV"
|
||||
}
|
||||
|
||||
// Parse command-line flags
|
||||
flag.StringVar(&commentArg, "comment", "#", "treat lines beginning with this rune as comments")
|
||||
outputFile := flag.String("o", "", "Output "+format+" file (default: stdout)")
|
||||
delimString := flag.String("d", string(delim), "field delimiter to use for output file ('\\t' for tab, '^_' for Unit Separator, etc)")
|
||||
useCRLF := flag.Bool("crlf", false, "use CRLF (\\r\\n) as record separator")
|
||||
urlOnly := flag.Bool("print-url", false, "don't download, just print the Google Sheet URL")
|
||||
parseOnly := flag.Bool("print-ids", false, "don't download, just print the Doc ID and Sheet ID (gid)")
|
||||
rawOnly := flag.Bool("raw", false, "don't parse, just download")
|
||||
flag.Usage = func() {
|
||||
fmt.Fprintf(os.Stderr, "Usage: %s [flags] <google-sheet-url-or-file-path>\n", os.Args[0])
|
||||
fmt.Fprintf(os.Stderr, "Converts a Google Sheet to %s format.\n\n", format)
|
||||
fmt.Fprintf(os.Stderr, "Flags:\n")
|
||||
flag.PrintDefaults()
|
||||
fmt.Fprintf(os.Stderr, "\nExample:\n")
|
||||
fmt.Fprintf(os.Stderr, " %s -o output.tsv 'https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit?gid=559037238#gid=559037238'\n", os.Args[0])
|
||||
fmt.Fprintf(os.Stderr, " %s -o output.tsv 'file://gsheet.csv'\n", os.Args[0])
|
||||
fmt.Fprintf(os.Stderr, " %s -o output.tsv './gsheet.csv'\n", os.Args[0])
|
||||
}
|
||||
flag.Parse()
|
||||
|
||||
// Check for URL argument
|
||||
if len(flag.Args()) != 1 {
|
||||
fmt.Fprintf(os.Stderr, "Error: exactly one Google Sheet URL is required\n")
|
||||
flag.Usage()
|
||||
os.Exit(1)
|
||||
}
|
||||
url := flag.Args()[0]
|
||||
|
||||
// Prepare output writer
|
||||
var out *os.File
|
||||
if *outputFile != "" {
|
||||
var err error
|
||||
out, err = os.Create(*outputFile)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error creating output file: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer func() { _ = out.Close() }()
|
||||
} else {
|
||||
out = os.Stdout
|
||||
}
|
||||
|
||||
switch *delimString {
|
||||
case "^_", "\\x1f":
|
||||
*delimString = unitSeparator
|
||||
case "^^", "\\x1e":
|
||||
*delimString = recordSeparator
|
||||
case "^]", "\\x1d":
|
||||
*delimString = groupSeparator
|
||||
case "^\\", "\\x1c":
|
||||
*delimString = fileSeparator
|
||||
case "^L", "\\f":
|
||||
*delimString = "\f"
|
||||
case "^K", "\\v":
|
||||
*delimString = "\v"
|
||||
case "^I", "\\t":
|
||||
*delimString = "\t"
|
||||
}
|
||||
delim, _ = utf8.DecodeRuneInString(*delimString)
|
||||
|
||||
var rc io.ReadCloser
|
||||
if strings.HasPrefix(url, "https://") || strings.HasPrefix(url, "http://") {
|
||||
docid, gid := gsheet2csv.ParseIDs(url)
|
||||
if *parseOnly {
|
||||
fmt.Printf("docid=%s\ngid=%s\n", docid, gid)
|
||||
} else {
|
||||
fmt.Fprintf(os.Stderr, "docid=%s\ngid=%s\n", docid, gid)
|
||||
}
|
||||
|
||||
sheetURL := gsheet2csv.ToCSVURL(docid, gid)
|
||||
if *urlOnly {
|
||||
fmt.Printf("%s\n", sheetURL)
|
||||
} else {
|
||||
fmt.Fprintf(os.Stderr, "downloading %s\n", sheetURL)
|
||||
}
|
||||
|
||||
if !*urlOnly {
|
||||
resp, err := gsheet2csv.GetSheet(docid, gid)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error getting url: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
rc = resp.Body
|
||||
}
|
||||
} else {
|
||||
url = strings.TrimPrefix(url, "file://")
|
||||
fmt.Fprintf(os.Stderr, "opening %s\n", url)
|
||||
f, err := os.Open(url)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error opening file: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
rc = f
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "\n")
|
||||
|
||||
if *urlOnly || *parseOnly {
|
||||
os.Exit(0)
|
||||
return
|
||||
}
|
||||
|
||||
if *rawOnly {
|
||||
if _, err := io.Copy(out, rc); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error getting url body: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
comment, _ := utf8.DecodeRuneInString(commentArg)
|
||||
|
||||
// Create a reader for the Google Sheet
|
||||
gsr := gsheet2csv.NewReader(rc)
|
||||
gsr.QuotedComments = false
|
||||
gsr.Comment = 0
|
||||
gsr.ReuseRecord = true
|
||||
|
||||
// Create CSV writer
|
||||
csvw := gsheet2csv.NewWriter(out)
|
||||
csvw.Comma = delim // Set delimiter to tab for TSV
|
||||
csvw.Comment = comment
|
||||
csvw.UseCRLF = *useCRLF
|
||||
for {
|
||||
// Convert each record
|
||||
record, err := gsr.Read()
|
||||
if err != nil {
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "Error reading "+format+": %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if err := csvw.Write(record); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error writing "+format+": %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
// Flush the writer to ensure all data is written
|
||||
csvw.Flush()
|
||||
if err := csvw.Error(); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error flushing "+format+" writer: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
33
io/transforms/gsheet2csv/fixtures/example.go
Normal file
33
io/transforms/gsheet2csv/fixtures/example.go
Normal file
@ -0,0 +1,33 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/therootcompany/golib/io/transform/gsheet2csv"
|
||||
)
|
||||
|
||||
func main() {
|
||||
switch len(os.Args) {
|
||||
case 2:
|
||||
break
|
||||
case 1:
|
||||
fmt.Fprintf(os.Stderr, "Usage: %s <url>\n", os.Args[0])
|
||||
os.Exit(1)
|
||||
}
|
||||
url := os.Args[1]
|
||||
|
||||
gsr := gsheet2csv.NewReaderFromURL(url)
|
||||
records, err := gsr.ReadAll()
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error reading from %s\n", gsr.URL)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
csvw := gsheet2csv.NewWriter(os.Stdout)
|
||||
csvw.Comment = gsr.Comment
|
||||
if err := csvw.WriteAll(records); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error writing csv %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
18
io/transforms/gsheet2csv/fixtures/gsheet.csv
Normal file
18
io/transforms/gsheet2csv/fixtures/gsheet.csv
Normal file
@ -0,0 +1,18 @@
|
||||
# this is a comment,,
|
||||
"# this is, well, a quoted comment",,
|
||||
"# this is a ""super""-quoted comment",,
|
||||
Key,Value,
|
||||
Name,55,
|
||||
Girlfriend's Age,55,
|
||||
,,
|
||||
My IQ,55,
|
||||
,55,
|
||||
"Key,with,Comma",,
|
||||
,"Value,with,Comma",
|
||||
"Quoted ""Key""",Normal Value,
|
||||
Normal Key,"Quoted ""Value""",
|
||||
"Quoted ""Key""",,
|
||||
,"Quoted ""Value""",
|
||||
x,y,z
|
||||
"# comment with trailing comma,",,
|
||||
#1,2,#3
|
|
3
io/transforms/gsheet2csv/go.mod
Normal file
3
io/transforms/gsheet2csv/go.mod
Normal file
@ -0,0 +1,3 @@
|
||||
module github.com/therootcompany/golib/io/transform/gsheet2csv
|
||||
|
||||
go 1.24.6
|
241
io/transforms/gsheet2csv/gsheet2csv.go
Normal file
241
io/transforms/gsheet2csv/gsheet2csv.go
Normal file
@ -0,0 +1,241 @@
|
||||
// Authored in 2025 by AJ ONeal <aj@therootcompany.com> (https://therootcompany.com)
|
||||
//
|
||||
// To the extent possible under law, the author(s) have dedicated all copyright
|
||||
// and related and neighboring rights to this software to the public domain
|
||||
// worldwide. This software is distributed without any warranty.
|
||||
//
|
||||
// You should have received a copy of the CC0 Public Domain Dedication along with
|
||||
// this software. If not, see <https://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
//
|
||||
// SPDX-License-Identifier: CC0-1.0
|
||||
|
||||
package gsheet2csv
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
var (
	// ErrHTTPGet reports that the sheet download did not answer 200 OK.
	// GetSheet wraps it with the received status code, so match it with
	// errors.Is rather than ==.
	ErrHTTPGet = errors.New("did not get 200 OK when downloading from URL")

	// ErrNoDocID reports that no document ID was supplied, for example
	// because ParseIDs could not find one in a URL.
	ErrNoDocID = errors.New("no Google Sheet document id")
)

// httpGet performs the download. It is a variable so tests can replace it.
var httpGet = http.Get

// Reader wraps an encoding/csv Reader (embedded, so its options remain
// available) and can skip comment rows.
type Reader struct {
	*csv.Reader
	// DocID, GID and URL describe the sheet when the Reader was built by
	// NewReaderFromIDs or NewReaderFromURL; they are empty otherwise.
	DocID string
	GID   string
	URL   string
	// QuotedComments, when true, makes Read skip every record whose first
	// field starts with Comment and whose remaining fields are all empty.
	QuotedComments bool
	// Comment is the rune that marks a comment row. It shadows the embedded
	// csv.Reader.Comment, which NewReader sets to 0.
	Comment rune
	r       io.Reader
	resp    *http.Response // response whose body is being read, if any
	close   bool           // whether resp.Body still has to be closed
	err     error          // construction error, returned by every Read
}

// NewReaderFromURL extracts the document and sheet IDs from url with ParseIDs
// and returns NewReaderFromIDs for them.
func NewReaderFromURL(url string) *Reader {
	docid, gid := ParseIDs(url)

	return NewReaderFromIDs(docid, gid)
}

// NewReaderFromIDs downloads the sheet identified by docid and gid and
// returns a Reader over the response body. The body is closed when Read hits
// an error (including io.EOF) or when Close is called.
//
// The function never returns nil: if the download fails, the returned Reader
// reports that error from Read and ReadAll. URL, DocID and GID are populated
// in both cases so callers can name the sheet in their diagnostics.
func NewReaderFromIDs(docid, gid string) *Reader {
	var r *Reader

	resp, err := GetSheet(docid, gid)
	if err != nil {
		r = NewReader(nil)
		r.err = err
	} else {
		r = NewReader(resp.Body)
		r.resp = resp
		r.close = true
	}

	r.URL = ToCSVURL(docid, gid)
	r.DocID = docid
	r.GID = gid
	return r
}

// ToCSVURL returns the CSV export URL for the given document and sheet IDs.
// The IDs are inserted verbatim.
func ToCSVURL(docid, gid string) string {
	return fmt.Sprintf("https://docs.google.com/spreadsheets/d/%s/export?format=csv&usp=sharing&gid=%s", docid, gid)
}

// GetSheet requests the CSV export of the given sheet. On success the caller
// owns resp.Body and must close it.
//
// It returns ErrNoDocID without issuing a request when docid is empty, the
// transport error when the request fails, and an error wrapping ErrHTTPGet
// (with the body already closed) when the status is not 200 OK.
func GetSheet(docid, gid string) (*http.Response, error) {
	if docid == "" {
		return nil, ErrNoDocID
	}

	resp, err := httpGet(ToCSVURL(docid, gid))
	if err != nil {
		return nil, err
	}

	if resp.StatusCode != http.StatusOK {
		_ = resp.Body.Close() // the body of a failed request is not used
		return nil, fmt.Errorf("%w (status %d)", ErrHTTPGet, resp.StatusCode)
	}

	return resp, nil
}

// NewReader returns a Reader over r with comment skipping enabled
// (QuotedComments true, Comment '#').
func NewReader(r io.Reader) *Reader {
	csvr := csv.NewReader(r)
	csvr.Comma = ','
	csvr.Comment = 0         // to allow distinguishing between quoted comments and fields
	csvr.FieldsPerRecord = 0 // Google Sheets is consistent
	csvr.LazyQuotes = false  // fields that need quotes use them correctly
	csvr.TrimLeadingSpace = false
	csvr.ReuseRecord = false
	return &Reader{
		Reader:         csvr,
		QuotedComments: true,
		Comment:        '#',
		r:              r,
	}
}

// Close releases the HTTP response body of a Reader created by
// NewReaderFromIDs or NewReaderFromURL. It is only needed when reading stops
// before Read has returned an error; it does nothing for other Readers and
// may be called more than once.
func (r *Reader) Close() error {
	if !r.close {
		return nil
	}
	r.close = false
	return r.resp.Body.Close()
}

// Read returns the next record, skipping comment rows when QuotedComments is
// set. Any error from the underlying csv.Reader, io.EOF included, is returned
// with a nil record after the response body (if any) has been closed.
func (r *Reader) Read() ([]string, error) {
	if r.err != nil {
		return nil, r.err
	}

	for {
		record, err := r.Reader.Read()
		if err != nil {
			_ = r.Close() // the read error is the one worth reporting
			return nil, err
		}

		if r.QuotedComments && r.isCommentRecord(record) {
			continue
		}
		return record, nil
	}
}

// isCommentRecord reports whether record's first field starts with r.Comment
// while every other field is empty.
func (r *Reader) isCommentRecord(record []string) bool {
	if len(record) == 0 || record[0] == "" {
		return false
	}
	if first, _ := utf8.DecodeRuneInString(record[0]); first != r.Comment {
		return false
	}
	for _, field := range record[1:] {
		if field != "" {
			return false
		}
	}
	return true
}

// ReadAll reads records with Read until io.EOF and returns them with a nil
// error. On any other error it returns the records read so far together with
// that error.
func (r *Reader) ReadAll() ([][]string, error) {
	var records [][]string

	for {
		record, err := r.Read()
		if err != nil {
			if errors.Is(err, io.EOF) {
				return records, nil
			}
			return records, err
		}
		if r.ReuseRecord {
			// With ReuseRecord the csv.Reader hands back the same slice on
			// every call; copy it so earlier rows are not overwritten.
			record = append([]string(nil), record...)
		}
		records = append(records, record)
	}
}

// ParseIDs extracts the document ID and sheet ID from a Google Sheets URL.
//
// The document ID is the text following "/spreadsheets/d/" up to the next
// '/', '?' or '#'. The sheet ID is the value of the first non-empty "gid="
// parameter after the document ID that is introduced by '?', '&', '#' or '/';
// it defaults to "0". Both results are empty when no document ID is found.
func ParseIDs(urlStr string) (docid string, gid string) {
	const prefix = "/spreadsheets/d/"
	_, rest, found := strings.Cut(urlStr, prefix)
	if !found {
		return "", ""
	}

	// Split the remainder into the document ID and whatever follows it.
	docid = rest
	rest = ""
	if end := strings.IndexAny(docid, "/?#"); end != -1 {
		docid, rest = docid[:end], docid[end:]
	}
	if docid == "" {
		return "", ""
	}

	const key = "gid="
	gid = "0"
	for from := 0; from < len(rest); {
		i := strings.Index(rest[from:], key)
		if i == -1 {
			break
		}
		i += from
		from = i + len(key)

		// Ignore parameters whose name merely ends in "gid".
		if i == 0 || !strings.ContainsRune("?&#/", rune(rest[i-1])) {
			continue
		}

		value := rest[from:]
		if end := strings.IndexAny(value, "#&?/"); end != -1 {
			value = value[:end]
		}
		if value != "" {
			gid = value
			break
		}
	}
	return docid, gid
}
|
||||
|
||||
// Writer wraps an encoding/csv Writer (embedded, so Comma, UseCRLF, Flush and
// Error remain available) and gives special treatment to records whose first
// field starts with the Comment rune.
type Writer struct {
	*csv.Writer
	// Comment is the rune that marks a comment. Zero disables the special
	// handling, so every record is written by the embedded csv.Writer.
	Comment rune
	w       io.Writer // destination, used for lines written around the csv.Writer
}

// NewWriter returns a Writer to w that treats '#' as the comment rune.
func NewWriter(w io.Writer) *Writer {
	return &Writer{
		Writer:  csv.NewWriter(w),
		Comment: '#',
		w:       w,
	}
}

// Write writes one record.
//
// Records with more than one field whose first field starts with Comment are
// written directly to the destination, after flushing rows still buffered in
// the embedded csv.Writer so that output order is preserved:
//
//   - if every other field is empty, only the first field is written,
//     unquoted, as a comment line;
//   - otherwise the record is data that merely looks like a comment, so its
//     first field is always quoted, and the other fields are quoted when
//     they contain the delimiter, a quote or a line break.
//
// All other records are passed to the embedded csv.Writer. The record slice
// is never modified.
func (w *Writer) Write(record []string) error {
	if !w.startsWithComment(record) {
		return w.Writer.Write(record)
	}

	w.Flush()
	if err := w.Error(); err != nil {
		return err
	}

	line := record[0]
	// A comment containing a line break cannot be emitted bare without
	// turning its tail into separate rows, so it takes the quoted path too.
	if !allEmpty(record[1:]) || strings.ContainsAny(line, "\r\n") {
		fields := make([]string, len(record))
		for i, field := range record {
			if i == 0 || w.needsQuotes(field) {
				field = `"` + strings.ReplaceAll(field, `"`, `""`) + `"`
			}
			fields[i] = field
		}
		line = strings.Join(fields, string(w.Comma))
	}

	eol := "\n"
	if w.UseCRLF {
		eol = "\r\n"
	}
	_, err := io.WriteString(w.w, line+eol)
	return err
}

// startsWithComment reports whether record has more than one field and its
// first field begins with the configured comment rune.
func (w *Writer) startsWithComment(record []string) bool {
	if w.Comment == 0 || len(record) < 2 || record[0] == "" {
		return false
	}
	first, _ := utf8.DecodeRuneInString(record[0])
	return first == w.Comment
}

// needsQuotes reports whether field contains the delimiter, a double quote or
// a line break.
func (w *Writer) needsQuotes(field string) bool {
	return strings.ContainsRune(field, w.Comma) || strings.ContainsAny(field, "\"\r\n")
}

// allEmpty reports whether every string in fields is empty.
func allEmpty(fields []string) bool {
	for _, field := range fields {
		if field != "" {
			return false
		}
	}
	return true
}

// WriteAll writes every record with Write, flushes, and returns the first
// error encountered.
func (w *Writer) WriteAll(records [][]string) error {
	for _, r := range records {
		if err := w.Write(r); err != nil {
			return err
		}
	}
	w.Flush()
	return w.Error()
}
|
250
io/transforms/gsheet2csv/gsheet2csv_test.go
Normal file
250
io/transforms/gsheet2csv/gsheet2csv_test.go
Normal file
@ -0,0 +1,250 @@
|
||||
package gsheet2csv
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"io"
|
||||
"net/http"
|
||||
"slices"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// mockHTTPClient is a stand-in for an HTTP client whose Get always yields the
// response and error stored in it.
type mockHTTPClient struct {
	resp *http.Response
	err  error
}

// Get ignores the requested URL and returns the canned response and error.
func (c *mockHTTPClient) Get(_ string) (*http.Response, error) {
	return c.resp, c.err
}
|
||||
|
||||
// sampleCSV mimics the structure of ai-models.csv from the project README.
|
||||
const sampleCSV = `# Generated by ollama list
|
||||
"# Sample Quoted Comment, with ""quotes"" itself"
|
||||
"NAME","ID","SIZE","MODIFIED"
|
||||
"qwen3-coder:30b","06c1097efce0","18 GB","8 days ago"
|
||||
"gpt-oss:20b","aa4295ac10c3","13 GB","8 days ago"
|
||||
|
||||
"gpt-oss:latest","aa4295ac10c3","13 GB","7 weeks ago"
|
||||
`
|
||||
|
||||
// malformedCSV for testing error handling.
|
||||
const malformedCSV = `# Comment
|
||||
"NAME","ID","SIZE","MODIFIED
|
||||
"qwen3-coder:30b","06c1097efce0","18 GB","8 days ago"
|
||||
`
|
||||
|
||||
// TestParseIDs verifies the ParseIDs function for various URL formats.
|
||||
func TestParseIDs(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
url string
|
||||
wantDoc string
|
||||
wantGid string
|
||||
}{
|
||||
{
|
||||
name: "Google Sheets Edit / Share URL with gid",
|
||||
url: "https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit?gid=559037238#gid=559037238",
|
||||
wantDoc: "1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34",
|
||||
wantGid: "559037238",
|
||||
},
|
||||
{
|
||||
name: "Google Sheets CSV URL with gid",
|
||||
url: "https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/export?format=csv&usp=sharing&gid=559037238",
|
||||
wantDoc: "1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34",
|
||||
wantGid: "559037238",
|
||||
},
|
||||
{
|
||||
name: "URL without gid",
|
||||
url: "https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit",
|
||||
wantDoc: "1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34",
|
||||
wantGid: "0",
|
||||
},
|
||||
{
|
||||
name: "Invalid URL",
|
||||
url: "https://example.com/invalid",
|
||||
wantDoc: "",
|
||||
wantGid: "",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
gotDoc, gotGid := ParseIDs(tt.url)
|
||||
if gotDoc != tt.wantDoc {
|
||||
t.Errorf("ParseIDs() docid = %q, want %q", gotDoc, tt.wantDoc)
|
||||
}
|
||||
if gotGid != tt.wantGid {
|
||||
t.Errorf("ParseIDs() gid = %q, want %q", gotGid, tt.wantGid)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestNewReaderFromURL tests initializing a Reader from a Google Sheets URL.
|
||||
func TestNewReaderFromURL(t *testing.T) {
|
||||
originalGet := httpGet
|
||||
defer func() { httpGet = originalGet }()
|
||||
|
||||
url := "https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit?gid=559037238"
|
||||
|
||||
// Test successful HTTP response
|
||||
mockResp := &http.Response{
|
||||
StatusCode: http.StatusOK,
|
||||
Body: io.NopCloser(strings.NewReader(sampleCSV)),
|
||||
}
|
||||
client := &mockHTTPClient{resp: mockResp}
|
||||
httpGet = client.Get
|
||||
|
||||
reader := NewReaderFromURL(url)
|
||||
if reader.err != nil {
|
||||
t.Errorf("NewReaderFromURL() unexpected error: %v", reader.err)
|
||||
}
|
||||
if reader.resp != mockResp {
|
||||
t.Error("NewReaderFromURL() did not set response correctly")
|
||||
}
|
||||
if !reader.close {
|
||||
t.Error("NewReaderFromURL() did not set close flag")
|
||||
}
|
||||
|
||||
// Test HTTP failure
|
||||
client = &mockHTTPClient{resp: mockResp}
|
||||
client.err = errors.New("network error")
|
||||
httpGet = client.Get
|
||||
|
||||
reader = NewReaderFromURL(url)
|
||||
if reader.err == nil {
|
||||
t.Error("NewReaderFromURL() expected error, got nil")
|
||||
}
|
||||
|
||||
// Test non-200 status
|
||||
client = &mockHTTPClient{resp: &http.Response{
|
||||
StatusCode: http.StatusNotFound,
|
||||
Body: io.NopCloser(strings.NewReader("these aren't the droids you're looking for")),
|
||||
}}
|
||||
httpGet = client.Get
|
||||
|
||||
reader = NewReaderFromURL(url)
|
||||
if reader.err == nil {
|
||||
t.Error("NewReaderFromURL() expected error for non-200 status, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
// TestRead tests the Read method for comment handling.
|
||||
func TestRead(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
quotedComments bool
|
||||
expected [][]string
|
||||
}{
|
||||
{
|
||||
name: "Skip comments",
|
||||
quotedComments: true,
|
||||
expected: [][]string{
|
||||
{"NAME", "ID", "SIZE", "MODIFIED"},
|
||||
{"qwen3-coder:30b", "06c1097efce0", "18 GB", "8 days ago"},
|
||||
{"gpt-oss:20b", "aa4295ac10c3", "13 GB", "8 days ago"},
|
||||
{"gpt-oss:latest", "aa4295ac10c3", "13 GB", "7 weeks ago"},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "Don't skip quoted comments",
|
||||
quotedComments: false,
|
||||
expected: [][]string{
|
||||
{"# Sample Quoted Comment, with \"quotes\" itself"},
|
||||
{"NAME", "ID", "SIZE", "MODIFIED"},
|
||||
{"qwen3-coder:30b", "06c1097efce0", "18 GB", "8 days ago"},
|
||||
{"gpt-oss:20b", "aa4295ac10c3", "13 GB", "8 days ago"},
|
||||
{"gpt-oss:latest", "aa4295ac10c3", "13 GB", "7 weeks ago"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
reader := NewReader(strings.NewReader(sampleCSV))
|
||||
reader.QuotedComments = tt.quotedComments
|
||||
|
||||
for i, want := range tt.expected {
|
||||
got, err := reader.Read()
|
||||
if err != nil {
|
||||
t.Errorf("Read() error at record %d: %v", i, err)
|
||||
}
|
||||
if !slices.Equal(got, want) {
|
||||
t.Errorf("Read() record %d = %v, want %v", i, got, want)
|
||||
}
|
||||
}
|
||||
|
||||
// Verify EOF
|
||||
_, err := reader.Read()
|
||||
if !errors.Is(err, io.EOF) {
|
||||
t.Errorf("Read() expected EOF, got %v", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestReadAll tests the ReadAll method for different configurations.
|
||||
func TestReadAll(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
quotedComments bool
|
||||
expected [][]string
|
||||
}{
|
||||
{
|
||||
name: "Skip comments",
|
||||
quotedComments: true,
|
||||
expected: [][]string{
|
||||
{"NAME", "ID", "SIZE", "MODIFIED"},
|
||||
{"qwen3-coder:30b", "06c1097efce0", "18 GB", "8 days ago"},
|
||||
{"gpt-oss:20b", "aa4295ac10c3", "13 GB", "8 days ago"},
|
||||
{"gpt-oss:latest", "aa4295ac10c3", "13 GB", "7 weeks ago"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
reader := NewReader(strings.NewReader(sampleCSV))
|
||||
reader.QuotedComments = tt.quotedComments
|
||||
|
||||
got, err := reader.ReadAll()
|
||||
if err != nil {
|
||||
t.Errorf("ReadAll() error: %v", err)
|
||||
}
|
||||
if len(got) != len(tt.expected) {
|
||||
t.Errorf("ReadAll() returned %d records, want %d", len(got), len(tt.expected))
|
||||
}
|
||||
for i, want := range tt.expected {
|
||||
if !slices.Equal(got[i], want) {
|
||||
t.Errorf("ReadAll() record %d = %v, want %v", i, got[i], want)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestNewReaderFromURLWithMalformedCSV tests NewReaderFromURL with malformed CSV.
|
||||
func TestNewReaderFromURLWithMalformedCSV(t *testing.T) {
|
||||
mockResp := &http.Response{
|
||||
StatusCode: http.StatusOK,
|
||||
Body: io.NopCloser(strings.NewReader(malformedCSV)),
|
||||
}
|
||||
client := &mockHTTPClient{resp: mockResp}
|
||||
originalGet := httpGet
|
||||
httpGet = client.Get
|
||||
defer func() { httpGet = originalGet }()
|
||||
|
||||
url := "https://docs.google.com/spreadsheets/d/1KdNsc63pk0QRerWDPcIL9cMnGQlG-9Ue9Jlf0PAAA34/edit?gid=559037238"
|
||||
reader := NewReaderFromURL(url)
|
||||
if reader.err != nil {
|
||||
t.Errorf("NewReaderFromURL() unexpected error: %v", reader.err)
|
||||
}
|
||||
|
||||
// Reading should fail due to malformed CSV
|
||||
_, err := reader.Read()
|
||||
if err == nil {
|
||||
t.Error("Read() expected error for malformed CSV, got nil")
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user