// Source: golib/io/transforms/gsheet2csv/gsheet2csv.go (242 lines, 4.9 KiB, Go)

// Authored in 2025 by AJ ONeal <aj@therootcompany.com> (https://therootcompany.com)
//
// To the extent possible under law, the author(s) have dedicated all copyright
// and related and neighboring rights to this software to the public domain
// worldwide. This software is distributed without any warranty.
//
// You should have received a copy of the CC0 Public Domain Dedication along with
// this software. If not, see <https://creativecommons.org/publicdomain/zero/1.0/>.
//
// SPDX-License-Identifier: CC0-1.0
package gsheet2csv
import (
"encoding/csv"
"errors"
"fmt"
"io"
"net/http"
"strings"
"unicode/utf8"
)
var (
	// ErrHTTPGet is returned by GetSheet when the export request completes
	// with any status other than 200 OK.
	ErrHTTPGet = errors.New("did not get 200 OK when downloading from URL")

	// httpGet performs the download. It is a variable rather than a direct
	// call to http.Get so that tests can substitute a mock.
	httpGet = http.Get
)
// Reader reads CSV records, typically the CSV export of a Google Sheet. It
// embeds a *csv.Reader and adds optional skipping of comment-only rows.
type Reader struct {
	*csv.Reader

	// DocID, GID and URL identify the sheet. NewReaderFromIDs fills them in
	// after a successful download; NewReader leaves them empty.
	DocID, GID, URL string

	// QuotedComments makes Read skip records whose first field starts with
	// Comment and whose remaining fields are all empty.
	QuotedComments bool
	// Comment is the rune that marks a comment row. It is distinct from the
	// embedded csv.Reader's own Comment field, which it shadows.
	Comment rune

	r     io.Reader      // source handed to NewReader
	resp  *http.Response // HTTP response backing the reader, when downloaded
	close bool           // whether Read closes resp.Body when reading fails
	err   error          // construction error that Read reports instead of reading
}
// NewReaderFromURL builds a Reader for the sheet referenced by a Google Sheets
// URL. The document ID and GID are taken from the URL with ParseIDs and passed
// straight on to NewReaderFromIDs, which performs the download.
func NewReaderFromURL(url string) *Reader {
	return NewReaderFromIDs(ParseIDs(url))
}
// NewReaderFromIDs downloads the sheet identified by docid and gid and returns
// a Reader over the response body.
//
// A Reader is returned even when the download fails; in that case it carries
// the download error and reports it from Read.
func NewReaderFromIDs(docid, gid string) *Reader {
	resp, err := GetSheet(docid, gid)
	if err != nil {
		failed := NewReader(nil)
		failed.err = err
		return failed
	}

	sheet := NewReader(resp.Body)
	sheet.DocID, sheet.GID = docid, gid
	sheet.URL = ToCSVURL(docid, gid)
	// Keep the response so Read can close its body once reading stops.
	sheet.resp, sheet.close = resp, true
	return sheet
}
// ToCSVURL returns the CSV export URL for the given document ID and sheet GID.
// Both values are inserted verbatim, without escaping.
func ToCSVURL(docid, gid string) string {
	const exportURL = "https://docs.google.com/spreadsheets/d/%s/export?format=csv&usp=sharing&gid=%s"

	return fmt.Sprintf(exportURL, docid, gid)
}
func GetSheet(docid, gid string) (*http.Response, error) {
downloadURL := ToCSVURL(docid, gid)
resp, err := httpGet(downloadURL)
if err != nil {
return nil, err
}
if resp.StatusCode != http.StatusOK {
_ = resp.Body.Close()
return nil, ErrHTTPGet
}
return resp, nil
}
// NewReader returns a Reader over r with QuotedComments enabled and '#' as the
// comment rune.
func NewReader(r io.Reader) *Reader {
	sheet := &Reader{
		Reader:         csv.NewReader(r),
		QuotedComments: true,
		Comment:        '#',
		r:              r,
	}

	// Configure the embedded parser explicitly. sheet.Comment is the wrapper's
	// field, so the parser's own Comment has to be reached through the
	// embedded value.
	parser := sheet.Reader
	parser.Comma = ','
	parser.Comment = 0              // to allow distinguishing between quoted comments and fields
	parser.FieldsPerRecord = 0      // Google Sheets is consistent
	parser.LazyQuotes = false       // fields that need quotes use them correctly
	parser.TrimLeadingSpace = false // keep leading whitespace in fields
	parser.ReuseRecord = false      // every Read returns a fresh slice

	return sheet
}
// Read returns the next record from the underlying CSV reader.
//
// If the Reader was created with a construction error, that error is returned
// on every call. When QuotedComments is set, records whose first field starts
// with Comment and whose remaining fields are all empty are skipped. Any error
// from the underlying reader (including io.EOF) is returned with a nil record;
// if the Reader owns an HTTP response, its body is closed at that point.
func (r *Reader) Read() ([]string, error) {
	if r.err != nil {
		return nil, r.err
	}

	for {
		fields, err := r.Reader.Read()
		if err != nil {
			if r.close {
				// Best-effort close: the read error is what gets reported.
				_ = r.resp.Body.Close()
			}
			return nil, err
		}

		if !r.QuotedComments || fields[0] == "" {
			return fields, nil
		}
		if first, _ := utf8.DecodeRuneInString(fields[0]); first != r.Comment {
			return fields, nil
		}

		// The row starts with the comment rune; it is only treated as a
		// comment when nothing else on the row carries data.
		hasData := false
		for _, f := range fields[1:] {
			if f != "" {
				hasData = true
				break
			}
		}
		if hasData {
			return fields, nil
		}
	}
}
// ReadAll calls Read until it fails and returns every record collected.
//
// Reaching io.EOF is the normal end of input and is reported as a nil error.
// Any other error is returned together with the records read before it.
func (r *Reader) ReadAll() ([][]string, error) {
	var rows [][]string
	for {
		row, err := r.Read()
		switch {
		case err == nil:
			rows = append(rows, row)
		case errors.Is(err, io.EOF):
			return rows, nil
		default:
			return rows, err
		}
	}
}
// ParseIDs extracts the document ID and sheet GID from a Google Sheets URL.
//
// The document ID is the text following "/spreadsheets/d/" up to the next
// '/', '?' or '#', or to the end of the string. The GID is the text following
// the first "gid=" up to the next '#', '&', '?' or '/', or to the end of the
// string; it defaults to "0" when absent or empty.
//
// If the URL has no "/spreadsheets/d/" segment, or the document ID is empty,
// both results are empty strings.
func ParseIDs(urlStr string) (docid string, gid string) {
	const prefix = "/spreadsheets/d/"

	start := strings.Index(urlStr, prefix)
	if start == -1 {
		return "", ""
	}
	rest := urlStr[start+len(prefix):]

	// A URL with no path after the key ("…/d/KEY?gid=1", "…/d/KEY#gid=1")
	// must not drag the query or fragment into the document ID.
	docid = rest
	if end := strings.IndexAny(rest, "/?#"); end != -1 {
		docid = rest[:end]
	}
	if docid == "" {
		return "", ""
	}

	gid = "0"
	if at := strings.Index(urlStr, "gid="); at != -1 {
		value := urlStr[at+len("gid="):]
		if end := strings.IndexAny(value, "#&?/"); end != -1 {
			value = value[:end]
		}
		if value != "" {
			gid = value
		}
	}
	return docid, gid
}
// Writer writes CSV records through an embedded *csv.Writer, with special
// handling for rows whose first field starts with the Comment rune.
type Writer struct {
	*csv.Writer

	// Comment is the rune that marks a comment row.
	Comment rune

	// w is the destination passed to NewWriter. Comment rows are written to
	// it directly, bypassing the csv.Writer's quoting rules.
	w io.Writer
}

// NewWriter returns a Writer that writes to w and treats '#' as the comment
// rune.
func NewWriter(w io.Writer) *Writer {
	return &Writer{
		Writer:  csv.NewWriter(w),
		Comment: '#',
		w:       w,
	}
}

// Write writes a single record.
//
// Records with fewer than two fields, or whose first field does not start
// with Comment, are passed to the embedded csv.Writer unchanged. Otherwise:
//
//   - if every field after the first is empty, only the first field is
//     written, verbatim and unquoted, on a line of its own;
//   - if any later field has data, all fields are written; the first field is
//     always quoted, and the others are quoted when they contain a quote, the
//     delimiter, a carriage return or a line feed.
//
// The caller's record slice is never modified. Comment rows are written to
// the destination immediately rather than buffered.
func (w *Writer) Write(record []string) error {
	if len(record) < 2 || record[0] == "" {
		return w.Writer.Write(record)
	}
	if first, _ := utf8.DecodeRuneInString(record[0]); first != w.Comment {
		return w.Writer.Write(record)
	}

	// Push out anything the csv.Writer has buffered so the comment row lands
	// after the records written before it, and stop if that already failed.
	w.Flush()
	if err := w.Error(); err != nil {
		return err
	}

	hasData := false
	for _, f := range record[1:] {
		if f != "" {
			hasData = true
			break
		}
	}

	fields := record[:1]
	if hasData {
		// Quote into a fresh slice so the caller's record stays untouched.
		fields = make([]string, len(record))
		for i, f := range record {
			if i == 0 || strings.ContainsRune(f, w.Comma) || strings.ContainsAny(f, "\"\r\n") {
				f = `"` + strings.ReplaceAll(f, `"`, `""`) + `"`
			}
			fields[i] = f
		}
	}

	// Use the same line terminator the csv.Writer uses for ordinary records.
	eol := "\n"
	if w.UseCRLF {
		eol = "\r\n"
	}
	_, err := io.WriteString(w.w, strings.Join(fields, string(w.Comma))+eol)
	return err
}

// WriteAll writes every record with Write, stopping at the first error, then
// flushes the embedded csv.Writer and returns any error it recorded.
func (w *Writer) WriteAll(records [][]string) error {
	for _, r := range records {
		if err := w.Write(r); err != nil {
			return err
		}
	}
	w.Flush()
	return w.Error()
}