golib/fs/dataset/dataset.go
AJ ONeal 8731eaf10b
refactor: decouple gitdataset/ipcohort for multi-file repos
gitshallow: fix double-fetch (pull already fetches), drop redundant -C flags
gitdataset: split into GitDataset[T] (file+atomic) and GitRepo (git+multi-dataset)
  - NewDataset for file-only use, AddDataset to register with a GitRepo
  - one clone/fetch per repo regardless of how many datasets it has
ipcohort: split Cohort into hosts (sorted /32, binary search) + nets (CIDRs, linear)
  - fixes false negatives when broad CIDRs (e.g. /8) precede specific entries
  - fixes Parse() sort-before-copy order bug
  - ReadAll always sorts; unsorted param removed (was dead code)
2026-04-19 22:34:25 -06:00

135 lines
3.7 KiB
Go

package dataset
import (
"context"
"fmt"
"os"
"path/filepath"
"sync/atomic"
"time"
"github.com/therootcompany/golib/net/gitshallow"
)
// File holds an atomically-swappable pointer to a value loaded from a file.
// Reads go through the embedded atomic.Pointer (Load) and are lock-free.
// Use NewFile for file-only use, or AddFile to attach to a GitRepo so the
// value is reloaded whenever the repo is synced.
type File[T any] struct {
	atomic.Pointer[T]

	// path is handed to loadFile on every reload.
	path string
	// loadFile parses the file at the given path into a fresh *T.
	loadFile func(string) (*T, error)
}

// NewFile creates a file-backed dataset with no git dependency.
//
// The handle starts out holding a zero-valued T, so Load returns a non-nil
// pointer even before the first Reload. Call Reload to do the initial load
// and again after any file change.
func NewFile[T any](path string, loadFile func(string) (*T, error)) *File[T] {
	d := &File[T]{
		path:     path,
		loadFile: loadFile,
	}
	d.Store(new(T))
	return d
}

// Reload reads the file and atomically replaces the stored value.
//
// Errors from the loader are returned unchanged and the previously stored
// value stays in place. A loader that returns (nil, nil) is also rejected:
// storing nil would undo the non-nil guarantee NewFile sets up and make
// readers that dereference Load() panic.
func (d *File[T]) Reload() error {
	v, err := d.loadFile(d.path)
	if err != nil {
		return err
	}
	if v == nil {
		return fmt.Errorf("dataset: loader returned nil value for %q", d.path)
	}
	d.Store(v)
	return nil
}

// reloadFile satisfies the unexported reloader interface by delegating to
// Reload, so GitRepo can refresh files of any T through one method.
func (d *File[T]) reloadFile() error {
	return d.Reload()
}
// reloader is the internal interface GitRepo uses to trigger file reloads.
// GitRepo stores its registered files as a slice of reloader, which lets
// File values with different type parameters live in the same slice.
type reloader interface {
// reloadFile asks the implementation to re-read its file and reports any
// failure; File[T] implements it by calling Reload.
reloadFile() error
}
// GitRepo manages a shallow git clone and reloads all registered files
// whenever the repo is updated. Multiple files from the same repo share
// one clone and one pull, avoiding git file-lock conflicts.
type GitRepo struct {
// path is the local checkout directory; AddFile joins relative paths onto it
// and Init looks for a ".git" entry beneath it.
path string
// shallowRepo performs the clone and sync operations for this checkout.
shallowRepo *gitshallow.ShallowRepo
// files holds every file registered through AddFile; sync reloads them in
// registration order.
files []reloader
}
// NewRepo returns a GitRepo for gitURL whose working copy lives at repoPath.
// Nothing is cloned or fetched here; call Init to do that.
func NewRepo(gitURL, repoPath string) *GitRepo {
	repo := new(GitRepo)
	repo.path = repoPath
	// NOTE(review): the trailing 1 and "" are presumably clone depth and
	// branch (default) — confirm against gitshallow.New.
	repo.shallowRepo = gitshallow.New(gitURL, repoPath, 1, "")
	return repo
}
// AddFile registers a file that lives inside repo and returns its handle.
// relPath is interpreted relative to the repo root. The returned File is
// reloaded whenever a sync reloads the repo's files: always during Init, and
// during Run or Sync when the repo was updated.
func AddFile[T any](repo *GitRepo, relPath string, loadFile func(string) (*T, error)) *File[T] {
	fullPath := filepath.Join(repo.path, relPath)
	file := NewFile(fullPath, loadFile)
	repo.files = append(repo.files, file)
	return file
}
// Init prepares the repo for use: it clones when no ".git" entry can be
// stat'ed under the repo path, then performs one forced sync so every
// registered file is loaded even if the pull brought nothing new.
// The sync always uses aggressive GC — acceptable as a one-time startup cost.
func (r *GitRepo) Init() error {
	if _, statErr := os.Stat(filepath.Join(r.path, ".git")); statErr != nil {
		// Any stat failure is treated as "not cloned yet".
		_, cloneErr := r.shallowRepo.Clone()
		if cloneErr != nil {
			return cloneErr
		}
	}

	if _, syncErr := r.sync(false, true); syncErr != nil {
		return syncErr
	}
	return nil
}
// Run syncs the repo on a fixed 47-minute ticker and reloads files when the
// repo changed. It blocks until ctx is done; the first sync happens one full
// interval after Run is called.
//
// lightGC=false (zero value) runs aggressive GC with immediate pruning to keep
// footprint minimal. Pass true to skip both when the periodic GC is too slow
// for your workload.
//
// Sync failures are written to stderr and do not stop the loop.
func (r *GitRepo) Run(ctx context.Context, lightGC bool) {
	const interval = 47 * time.Minute

	tick := time.NewTicker(interval)
	defer tick.Stop()

	for {
		// Wait for either the next tick or cancellation.
		select {
		case <-ctx.Done():
			return
		case <-tick.C:
		}

		reloaded, err := r.sync(lightGC, false)
		switch {
		case err != nil:
			fmt.Fprintf(os.Stderr, "error: git repo sync: %v\n", err)
		case reloaded:
			fmt.Fprintf(os.Stderr, "git repo: files reloaded\n")
		}
	}
}
// Sync pulls the latest commits once and reloads all registered files if the
// repo was updated. It reports whether a reload pass ran.
// lightGC=false (zero value) runs aggressive GC with immediate pruning to keep
// footprint minimal.
func (r *GitRepo) Sync(lightGC bool) (bool, error) {
	// An on-demand sync never forces a reload of an unchanged repo.
	const force = false
	return r.sync(lightGC, force)
}
// sync is the shared implementation behind Init, Run and Sync.
//
// It syncs the shallow repo (passing lightGC through) and, when the repo
// reports an update or force is set, reloads every registered file in
// registration order. A failing reload is logged to stderr and does not stop
// the remaining files from reloading, nor is it returned to the caller.
//
// The bool result is true when a reload pass ran; a sync failure yields
// (false, wrapped error).
func (r *GitRepo) sync(lightGC, force bool) (bool, error) {
	changed, syncErr := r.shallowRepo.Sync(lightGC)
	if syncErr != nil {
		return false, fmt.Errorf("git sync: %w", syncErr)
	}

	if changed || force {
		for i := range r.files {
			if reloadErr := r.files[i].reloadFile(); reloadErr != nil {
				fmt.Fprintf(os.Stderr, "error: reload file: %v\n", reloadErr)
			}
		}
		return true, nil
	}
	return false, nil
}