golib/io/transforms/gsheet2csv/gsheet2csv.go

// Authored in 2025 by AJ ONeal <aj@therootcompany.com> (https://therootcompany.com)
//
// To the extent possible under law, the author(s) have dedicated all copyright
// and related and neighboring rights to this software to the public domain
// worldwide. This software is distributed without any warranty.
//
// You should have received a copy of the CC0 Public Domain Dedication along with
// this software. If not, see <https://creativecommons.org/publicdomain/zero/1.0/>.
//
// SPDX-License-Identifier: CC0-1.0

package gsheet2csv

import (
	"encoding/csv"
	"errors"
	"fmt"
	"io"
	"net/http"
	"os"
	"strings"
	"unicode/utf8"
)

var (
	// ErrHTTPGet is the error GetSheet returns when the download responds
	// with any status other than 200 OK.
	ErrHTTPGet = errors.New("did not get 200 OK when downloading from URL")

	// httpGet is the function GetSheet uses to fetch a URL. It is a package
	// variable so that tests can substitute a fake.
	httpGet = http.Get
)

// Reader wraps an encoding/csv Reader. Its Read method can additionally skip
// rows that consist only of a comment in the first field.
type Reader struct {
	*csv.Reader
	// DocID and GID are the identifiers the sheet was requested with; they are
	// filled in by NewReaderFromIDs on a successful download.
	DocID          string
	GID            string
	// URL is the export URL (NewReaderFromIDs) or the file path (NewReaderFrom)
	// the data came from.
	URL            string
	// QuotedComments, when true, makes Read skip rows whose first field starts
	// with Comment and whose remaining fields are all empty.
	QuotedComments bool
	// Comment is the rune compared against the first rune of the first field.
	// NewReader sets it to '#'.
	Comment        rune
	// r is the io.Reader that was passed to NewReader.
	r              io.Reader
	// resp is the HTTP response backing the reader; Read closes resp.Body on
	// error when close is true.
	resp           *http.Response
	// close reports whether Read should close resp.Body when it hits an error.
	close          bool
	// err is an error captured while constructing the reader; when non-nil,
	// Read returns it instead of reading.
	err            error
}

// NewReaderFrom returns a Reader for either a URL or a local file.
//
// Arguments beginning with "https://" or "http://" are handed to
// NewReaderFromURL. Anything else is treated as a file path, with an optional
// "file://" prefix removed before opening.
//
// No error is returned here: if the file cannot be opened, the error is stored
// on the Reader and reported by the first call to Read. The Reader's URL field
// is set to the path in both the success and the failure case.
//
// The opened file is closed by Read when it reaches io.EOF or any other error.
func NewReaderFrom(urlOrPath string) *Reader {
	if strings.HasPrefix(urlOrPath, "https://") || strings.HasPrefix(urlOrPath, "http://") {
		return NewReaderFromURL(urlOrPath)
	}

	urlOrPath = strings.TrimPrefix(urlOrPath, "file://")
	f, err := os.Open(urlOrPath)
	if err != nil {
		// Pass an untyped nil so the Reader does not wrap a nil *os.File in a
		// non-nil io.Reader interface value.
		r := NewReader(nil)
		r.URL = urlOrPath
		r.err = err
		return r
	}

	r := NewReader(f)
	r.URL = urlOrPath
	// Read only knows how to close r.resp.Body, so hand it the file through
	// that field; otherwise the file descriptor would never be released.
	r.resp = &http.Response{Body: f}
	r.close = true
	return r
}

// NewReaderFromURL extracts the document ID and gid from url with ParseIDs and
// returns the Reader that NewReaderFromIDs builds for them. Whatever ParseIDs
// yields is passed on as-is, including empty strings.
func NewReaderFromURL(url string) *Reader {
	return NewReaderFromIDs(ParseIDs(url))
}

// NewReaderFromIDs downloads the sheet identified by docid and gid via
// GetSheet and returns a Reader over the response body.
//
// On success the Reader records the export URL, docid and gid, and is marked
// so that Read closes the response body when it hits an error. If the
// download fails, the returned Reader has no source and its first Read
// returns the download error.
func NewReaderFromIDs(docid, gid string) *Reader {
	resp, err := GetSheet(docid, gid)
	if err == nil {
		reader := NewReader(resp.Body)
		reader.resp, reader.close = resp, true
		reader.DocID, reader.GID = docid, gid
		reader.URL = ToCSVURL(docid, gid)
		return reader
	}

	failed := NewReader(nil)
	failed.err = err
	return failed
}

// ToCSVURL returns the Google Sheets CSV export URL for the given document ID
// and gid. Both values are inserted verbatim, without escaping.
func ToCSVURL(docid, gid string) string {
	const exportFormat = "https://docs.google.com/spreadsheets/d/%s/export?format=csv&usp=sharing&gid=%s"

	return fmt.Sprintf(exportFormat, docid, gid)
}

func GetSheet(docid, gid string) (*http.Response, error) {
	downloadURL := ToCSVURL(docid, gid)

	resp, err := httpGet(downloadURL)
	if err != nil {
		return nil, err
	}

	if resp.StatusCode != http.StatusOK {
		_ = resp.Body.Close()
		return nil, ErrHTTPGet
	}

	return resp, nil
}

// NewReader returns a Reader over r with comment skipping enabled and '#' as
// the comment rune.
//
// The embedded csv.Reader is configured with a ',' separator, strict quoting,
// no leading-space trimming, no record reuse, and a variable number of fields
// per record. Its own comment handling is switched off so that comment rows
// reach Read, which decides whether to skip them.
func NewReader(r io.Reader) *Reader {
	inner := csv.NewReader(r)
	inner.Comma = ','
	inner.LazyQuotes = false // quoting in the input is expected to be well-formed
	inner.TrimLeadingSpace = false
	inner.ReuseRecord = false
	inner.FieldsPerRecord = -1 // rows may differ in length once comments are involved
	inner.Comment = 0          // comment rows are filtered in Read, not by encoding/csv

	reader := new(Reader)
	reader.Reader = inner
	reader.r = r
	reader.Comment = '#'
	reader.QuotedComments = true
	return reader
}

// Read returns the next record from the underlying csv.Reader.
//
// When QuotedComments is true, a record is skipped if its first field starts
// with the Comment rune and every other field is empty. A record whose first
// field starts with Comment but which carries data in a later field is
// returned like any other record.
//
// If the Reader was constructed with a pending error, that error is returned
// without reading. When the Reader owns a closable source, the first error
// from the underlying reader (including io.EOF) closes that source exactly
// once and is remembered, so every later call returns the same error instead
// of reading from a closed source.
func (r *Reader) Read() ([]string, error) {
	if r.err != nil {
		return nil, r.err
	}

	for {
		record, err := r.Reader.Read()
		if err != nil {
			if r.close {
				_ = r.resp.Body.Close()
				// The source is gone: never close it twice, and keep reporting
				// this error rather than whatever a closed body would produce.
				r.close = false
				r.err = err
			}
			return nil, err
		}

		if r.QuotedComments && len(record) > 0 && len(record[0]) > 0 {
			if lead, _ := utf8.DecodeRuneInString(record[0]); lead == r.Comment {
				// Find the last non-empty field; index 0 is known to be non-empty.
				last := len(record) - 1
				for last > 0 && len(record[last]) == 0 {
					last--
				}
				if last == 0 {
					// Only the comment field has content: skip the row.
					continue
				}
			}
		}
		return record, nil
	}
}

// ReadAll calls Read until it fails and returns every record collected.
//
// Reaching io.EOF is the normal way to finish and is reported as a nil error.
// Any other error is returned together with the records read before it.
func (r *Reader) ReadAll() ([][]string, error) {
	var rows [][]string

	for {
		row, err := r.Read()
		switch {
		case err == nil:
			rows = append(rows, row)
		case errors.Is(err, io.EOF):
			return rows, nil
		default:
			return rows, err
		}
	}
}

// ParseIDs extracts the document ID and gid from a Google Sheets URL.
//
// The document ID is the text following "/spreadsheets/d/" up to the next
// '/', '?' or '#', or to the end of the string. If the prefix is missing or
// the ID is empty, both results are empty strings.
//
// The gid is the text following the first "gid=" anywhere in the URL, up to
// the next '#', '&', '?' or '/'. When no gid is present, or it is empty, gid
// defaults to "0".
func ParseIDs(urlStr string) (docid string, gid string) {
	const prefix = "/spreadsheets/d/"
	start := strings.Index(urlStr, prefix)
	if start == -1 {
		return "", ""
	}
	rest := urlStr[start+len(prefix):]

	// Stop at a query or fragment as well as a slash, so that URLs without a
	// trailing "/edit" segment do not leak "?gid=..." into the document ID.
	docid = rest
	if end := strings.IndexAny(rest, "/?#"); end != -1 {
		docid = rest[:end]
	}
	if docid == "" {
		return "", ""
	}

	gid = "0"
	if i := strings.Index(urlStr, "gid="); i != -1 {
		value := urlStr[i+len("gid="):]
		if end := strings.IndexAny(value, "#&?/"); end != -1 {
			value = value[:end]
		}
		if value != "" {
			gid = value
		}
	}
	return docid, gid
}

// Writer wraps an encoding/csv Writer. Its Write method gives special
// treatment to multi-field records whose first field starts with Comment.
type Writer struct {
	*csv.Writer
	// Comment is the rune compared against the first rune of the first field.
	// NewWriter sets it to '#'.
	Comment       rune
	// QuoteComments is not consulted by the methods visible in this file; the
	// only reference to it is in commented-out code inside Write.
	QuoteComments bool
	// w is the underlying io.Writer, used by Write to emit comment-led rows
	// directly after flushing the embedded csv.Writer.
	w             io.Writer
}

// NewWriter returns a Writer that writes to w through a new csv.Writer and
// uses '#' as the comment rune. QuoteComments is left at its zero value.
func NewWriter(w io.Writer) *Writer {
	writer := new(Writer)
	writer.Writer = csv.NewWriter(w)
	writer.Comment = '#'
	writer.w = w
	return writer
}

// Write writes a single record.
//
// Records with fewer than two fields, and records whose first field does not
// start with the Comment rune, are passed to the embedded csv.Writer.
//
// The remaining records are formatted here and written straight to the
// underlying writer, after flushing anything the csv.Writer has buffered so
// that rows stay in order:
//
//   - If every field after the first is empty, only the first field is
//     written and it is not forced into quotes, so it stays a plain comment.
//   - Otherwise the first field is always quoted, which keeps a data row from
//     being mistaken for a comment, and all fields are written.
//
// In both cases a field is also quoted when it contains a double quote, the
// separator, a carriage return or a line feed. The record passed in is never
// modified. The row ends with "\r\n" when UseCRLF is set and "\n" otherwise.
//
// It returns any error already recorded by the csv.Writer after the flush, or
// the error from writing the row.
func (w *Writer) Write(record []string) error {
	if len(record) < 2 {
		return w.Writer.Write(record)
	}
	if lead, _ := utf8.DecodeRuneInString(record[0]); lead != w.Comment {
		return w.Writer.Write(record)
	}

	// Locate the last field that has content; stop at index 0 at the latest.
	last := len(record) - 1
	for last > 0 && len(record[last]) == 0 {
		last--
	}

	fields := record
	forceQuote := 0 // index of the field that must be quoted regardless of content
	if last == 0 {
		// A pure comment: drop the empty trailing fields and leave it unquoted.
		fields = record[:1]
		forceQuote = -1
	}

	comma := string(w.Comma)
	var line strings.Builder
	for i, f := range fields {
		if i > 0 {
			line.WriteString(comma)
		}
		if i == forceQuote || strings.Contains(f, comma) || strings.ContainsAny(f, "\"\r\n") {
			line.WriteByte('"')
			line.WriteString(strings.ReplaceAll(f, `"`, `""`))
			line.WriteByte('"')
			continue
		}
		line.WriteString(f)
	}
	if w.UseCRLF {
		line.WriteString("\r\n")
	} else {
		line.WriteByte('\n')
	}

	// Push out buffered rows first; a failed flush means earlier rows are
	// missing, so report it instead of writing past the gap.
	w.Flush()
	if err := w.Error(); err != nil {
		return err
	}
	_, err := w.w.Write([]byte(line.String()))
	return err
}

// QuoteRecord quotes, in place, the fields of record that need it.
//
// A field is wrapped in double quotes, with embedded double quotes doubled,
// when it is the first field and starts with comment, or when it contains a
// double quote, the comma string, a carriage return or a line feed. All other
// fields are left untouched.
//
// The slice's elements are overwritten; pass a copy if the unquoted values are
// still needed.
func QuoteRecord(record []string, comma, comment string) {
	for i, f := range record {
		commentLike := i == 0 && strings.HasPrefix(f, comment)
		if !commentLike && !strings.Contains(f, comma) && !strings.ContainsAny(f, "\"\r\n") {
			continue
		}
		record[i] = `"` + strings.ReplaceAll(f, `"`, `""`) + `"`
	}
}

// WriteAll writes each record with Write, stopping at the first failure and
// returning its error. When every record has been written it flushes the
// embedded csv.Writer and returns whatever error that writer reports.
func (w *Writer) WriteAll(records [][]string) error {
	for i := range records {
		err := w.Write(records[i])
		if err != nil {
			return err
		}
	}

	w.Flush()
	return w.Error()
}