diff --git a/.gitignore b/.gitignore index 595dfa4..4e1454c 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ static/*.db-wal .secrets .env + +internal-docs/ +internal/devseed/fixtures/ diff --git a/README.md b/README.md index 1926b7f..1994586 100644 --- a/README.md +++ b/README.md @@ -47,4 +47,19 @@ API keys are scoped to an entity and carry a permission bitfield. Keys are prefi ## Rate Limiting -Rate limits are enforced in-memory per process. Throttled responses return `429 Too Many Requests` with `X-RateLimit-*` and `Retry-After` headers. \ No newline at end of file +Rate limits are enforced in-memory per process. Throttled responses return `429 Too Many Requests` with `X-RateLimit-*` and `Retry-After` headers. + +## Seeding local data + +Production ingests live data from contributing marketplaces. For local development, a synthetic dataset (~6 months, ~2k rows with realistic daily variance) can be loaded with: + +```bash +go run ./cmd/seed +``` + +The seed: + +- Must be run in a development environment. +- Generates a 6-month dataset so the dashboard at `/` has enough data to exercise every period (7d / 30d / 3m / 6m / 1y). +- Uses a synthetic Steam ID prefix (`76561198000000000`) so generated IDs are clearly fake. +- Is safe to re-run: rows are de-duplicated on `(steam_id, marketplace_slug)` via `ON CONFLICT DO NOTHING`, so a rerun skips rows that already exist instead of raising a unique-constraint error. \ No newline at end of file diff --git a/cmd/seed/main.go b/cmd/seed/main.go new file mode 100644 index 0000000..1921e77 --- /dev/null +++ b/cmd/seed/main.go @@ -0,0 +1,62 @@ +// Command seed loads a deterministic synthetic dataset into the local +// Reverse Watch Postgres databases for local dashboard development. +// It is NEVER intended to run in production and refuses to run unless +// Config.Environment is "development". +// +// go run ./cmd/seed +// +// The insert uses ON CONFLICT (steam_id, marketplace_slug) DO NOTHING, so +// re-running is safe: rows that already exist are skipped rather than erroring. +package main + +import ( + "fmt" + "os" + "time" + + "reverse-watch/config" + "reverse-watch/domain/models" + "reverse-watch/domain/models/constants" + "reverse-watch/internal/devseed" + "reverse-watch/logging" + "reverse-watch/repository/factory" + "reverse-watch/secret" +) + +func main() { + logging.Initialize() + cfg := config.Load() + + if cfg.Environment != constants.EnvironmentDevelopment { + fmt.Fprintf(os.Stderr, "refusing to seed: environment is %q (only %q is allowed)\n", cfg.Environment, constants.EnvironmentDevelopment) + os.Exit(1) + } + + // Required by factory bootstrap (e.g. admin API key seeding). The + // synthetic generator pre-populates its own IDs, so the snowflake + // generator does not actually run for them. + models.InitSnowflakeGenerator(0, 0) + + keygen := secret.NewKeyGenerator(cfg.Environment) + f, err := factory.NewFactory(cfg, keygen) + if err != nil { + fmt.Fprintf(os.Stderr, "failed to create factory: %v\n", err) + os.Exit(1) + } + defer func() { + if err := f.Close(); err != nil { + fmt.Fprintf(os.Stderr, "failed to close factory: %v\n", err) + } + }() + + reversals := devseed.GenerateSynthetic(time.Now().UTC()) + fmt.Printf("generated %d synthetic reversals (deterministic seed)\n", len(reversals)) + + inserted, err := devseed.InsertReversals(f.PublicDB(), reversals) + if err != nil { + fmt.Fprintf(os.Stderr, "failed to insert reversals: %v\n", err) + os.Exit(1) + } + skipped := int64(len(reversals)) - inserted + fmt.Printf("seed complete: %d inserted, %d already present (skipped)\n", inserted, skipped) +} diff --git a/internal/devseed/synthetic.go b/internal/devseed/synthetic.go new file mode 100644 index 0000000..6cdc5ff --- /dev/null +++ b/internal/devseed/synthetic.go @@ -0,0 +1,198 @@ +// Package devseed loads dev-only fixture data into the local Postgres +// instance. It is intentionally not wired into the main binary — call it +// from cmd/seed (or a test) when you need realistic data locally. +package devseed + +import ( + "math" + "math/rand" + "time" + + "reverse-watch/domain/models" + + "gorm.io/gorm" + "gorm.io/gorm/clause" +) + +const ( + syntheticRNGSeed int64 = 42 + syntheticDays = 180 + syntheticTargetTotal int = 2000 + syntheticBaseSteamID uint64 = 76561198000000000 + syntheticBaseReporter uint = 2_900_000 +) + +var syntheticMarketplaces = []struct { + slug string + weight float64 +}{ + {"csfloat", 0.80}, + {"tradeit", 0.10}, + {"skinport", 0.05}, + {"swap.gg", 0.05}, +} + +// GenerateSynthetic returns a ~6-month dataset (~2,000 rows, at least one +// per day, gentle sinusoid with occasional spikes / quiet days). Snowflake +// IDs are unique within the slice and won't collide with real CSV-seeded +// IDs, so callers can pipe the result straight into InsertReversals. +func GenerateSynthetic(now time.Time) []*models.Reversal { + now = now.UTC() + rng := rand.New(rand.NewSource(syntheticRNGSeed)) + nowMs := uint64(now.UnixMilli()) + today := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, time.UTC) + + counts := make([]int, syntheticDays) + for d := 0; d < syntheticDays; d++ { + base := 40.0 + 10.0*math.Sin(float64(d)/30.0) + var mult float64 + switch r := rng.Float64(); { + case r < 0.05: + mult = 2.5 + rng.Float64()*2.5 + case r < 0.15: + mult = 0.2 + rng.Float64()*0.3 + default: + mult = 0.7 + rng.Float64()*0.6 + } + counts[d] = int(math.Max(1, base*mult+rng.NormFloat64()*5)) + } + + total := 0 + for _, c := range counts { + total += c + } + if total > 0 { + scale := float64(syntheticTargetTotal) / float64(total) + for d := range counts { + counts[d] = int(math.Max(1, math.Round(float64(counts[d])*scale))) + } + } + + rows := make([]*models.Reversal, 0, syntheticTargetTotal+200) + var steamOffset uint64 = 1 + var seq uint16 + + for d := 0; d < syntheticDays; d++ { + dayStart := today.AddDate(0, 0, -(syntheticDays-1-d)) + for i := 0; i < counts[d]; i++ { + reversedAt := dayStart.Add(time.Duration(rng.Float64() * float64(24*time.Hour))) + if uint64(reversedAt.UnixMilli()) > nowMs { + reversedAt = now.Add(-1 * time.Minute) + } + reportDelay := time.Duration(rng.Intn(8*60)+1) * time.Minute + createdAt := reversedAt.Add(reportDelay) + if uint64(createdAt.UnixMilli()) > nowMs { + createdAt = now + } + + srcRoll := rng.Float64() + var src models.Source + var related *models.SteamID + switch { + case srcRoll < 0.90: + src = models.SourceDirect + case srcRoll < 0.95: + src = models.SourceRelatedUser + relID := models.SteamID(syntheticBaseSteamID + (steamOffset+10_000)*97) + related = &relID + default: + src = models.SourceUserReport + } + + var expunged *uint64 + if rng.Float64() < 0.015 { + eAt := createdAt.Add(time.Duration(rng.Intn(24*60)+1) * time.Minute) + if uint64(eAt.UnixMilli()) < nowMs && uint64(eAt.UnixMilli()) > uint64(createdAt.UnixMilli()) { + ems := uint64(eAt.UnixMilli()) + expunged = &ems + } + } + + steamID := models.SteamID(syntheticBaseSteamID + steamOffset*73 + uint64(rng.Intn(50))) + steamOffset++ + + // Snowflake encodes created_at + a 12-bit per-ms sequence; + // mirrors domain/models/snowflake.go so generated IDs sort + // chronologically alongside production rows. + seq = (seq + 1) & 0x0FFF + // createdAt is always well after models.Epoch for the ~6-month + // synthetic window, but clamp defensively so an out-of-range + // createdAt can't underflow the unsigned subtraction into a + // garbage snowflake (mirrors the guard in genSnowflakeWithParts). + createdAtMs := uint64(createdAt.UnixMilli()) + if createdAtMs < models.Epoch { + createdAtMs = models.Epoch + } + sfTs := createdAtMs - models.Epoch + sf := models.Snowflake((sfTs << 22) | uint64(seq)) + + reporter := syntheticBaseReporter + uint(steamOffset) + mp := pickMarketplace(rng) + + rows = append(rows, &models.Reversal{ + Model: models.Model{ + ID: sf, + CreatedAt: uint64(createdAt.UnixMilli()), + UpdatedAt: uint64(createdAt.UnixMilli()), + }, + SteamID: steamID, + MarketplaceSlug: mp, + Source: &src, + RelatedSteamID: related, + ReversedAt: uint64(reversedAt.UnixMilli()), + ReporterInternalID: &reporter, + ExpungedAt: expunged, + }) + } + } + return rows +} + +func pickMarketplace(rng *rand.Rand) string { + r := rng.Float64() + cum := 0.0 + for _, mp := range syntheticMarketplaces { + cum += mp.weight + if r < cum { + return mp.slug + } + } + return syntheticMarketplaces[0].slug +} + +// insertChunkSize keeps each bulk insert under Postgres's 65,535 +// parameter-per-statement cap. At ~11 columns per Reversal, 1,000 rows +// uses ~11k parameters. +const insertChunkSize = 1000 + +// InsertReversals bulk-inserts reversals, skipping any row whose +// (steam_id, marketplace_slug) already exists via ON CONFLICT DO NOTHING. +// That pair is deterministic across runs (it does not depend on wall-clock +// time), so re-running the seed is safe and idempotent: already-present rows +// are skipped instead of raising a unique-constraint error. The conflict +// target matches the partial unique index created in repository/public +// (idx_reversals_steam_id_marketplace_slug ... WHERE deleted_at IS NULL). +// Returns the number of rows actually inserted. +func InsertReversals(db *gorm.DB, reversals []*models.Reversal) (int64, error) { + if len(reversals) == 0 { + return 0, nil + } + onConflict := clause.OnConflict{ + Columns: []clause.Column{{Name: "steam_id"}, {Name: "marketplace_slug"}}, + TargetWhere: clause.Where{Exprs: []clause.Expression{clause.Expr{SQL: "deleted_at IS NULL"}}}, + DoNothing: true, + } + var inserted int64 + for i := 0; i < len(reversals); i += insertChunkSize { + end := i + insertChunkSize + if end > len(reversals) { + end = len(reversals) + } + res := db.Clauses(onConflict).Create(reversals[i:end]) + if res.Error != nil { + return inserted, res.Error + } + inserted += res.RowsAffected + } + return inserted, nil +} diff --git a/internal/devseed/synthetic_test.go b/internal/devseed/synthetic_test.go new file mode 100644 index 0000000..5eecc02 --- /dev/null +++ b/internal/devseed/synthetic_test.go @@ -0,0 +1,65 @@ +package devseed + +import ( + "testing" + "time" + + "reverse-watch/domain/models" + "reverse-watch/internal/testutil" + "reverse-watch/repository/public" +) + +// TestInsertReversals_RerunIsSafe guards Finding 1: re-running the seed on a +// different calendar day produces new snowflake IDs but the same deterministic +// (steam_id, marketplace_slug) pairs, which collide with the partial unique +// index. The insert must skip those rows via ON CONFLICT DO NOTHING rather +// than raising a unique-constraint error. +func TestInsertReversals_RerunIsSafe(t *testing.T) { + db := testutil.NewTestDB(t) + if err := public.CreateIndexes(db); err != nil { + t.Fatalf("CreateIndexes: %v", err) + } + + // Both runs use past timestamps (BeforeCreate rejects future + // reversed_at), but on different calendar days so snowflake IDs differ. + now := time.Now().UTC().AddDate(0, 0, -10) + first := GenerateSynthetic(now) + if _, err := InsertReversals(db, first); err != nil { + t.Fatalf("first insert: %v", err) + } + + later := now.AddDate(0, 0, 5) + second := GenerateSynthetic(later) + n2, err := InsertReversals(db, second) + if err != nil { + t.Fatalf("rerun errored, expected ON CONFLICT DO NOTHING to skip: %v", err) + } + if n2 != 0 { + t.Errorf("rerun inserted %d rows, want 0 (idempotent)", n2) + } + + if first[0].ID == second[0].ID { + t.Errorf("expected differing snowflake IDs across days, got %d for both", first[0].ID) + } +} + +// TestGenerateSynthetic_NoEpochUnderflow guards Finding 2: generating data +// with a clock that predates models.Epoch must not underflow the unsigned +// snowflake timestamp subtraction. Every decoded timestamp must be >= Epoch. +func TestGenerateSynthetic_NoEpochUnderflow(t *testing.T) { + // A clock one day before Epoch makes every generated createdAt precede + // Epoch, so the guard must clamp each snowflake timestamp to exactly + // Epoch. Without the guard the unsigned subtraction wraps to a huge + // value, decoding to a timestamp far beyond Epoch. + beforeEpoch := time.UnixMilli(int64(models.Epoch)).UTC().AddDate(0, 0, -1) + rows := GenerateSynthetic(beforeEpoch) + if len(rows) == 0 { + t.Fatal("expected synthetic rows") + } + for _, r := range rows { + ts := models.ParseSnowflake(r.ID).Timestamp + if ts != models.Epoch { + t.Fatalf("snowflake timestamp %d != Epoch %d (underflow not guarded)", ts, models.Epoch) + } + } +}