Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@ static/*.db-wal

.secrets
.env

internal-docs/
internal/devseed/fixtures/
17 changes: 16 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,19 @@ API keys are scoped to an entity and carry a permission bitfield. Keys are prefi

## Rate Limiting

Rate limits are enforced in-memory per process. Throttled responses return `429 Too Many Requests` with `X-RateLimit-*` and `Retry-After` headers.
Rate limits are enforced in-memory per process. Throttled responses return `429 Too Many Requests` with `X-RateLimit-*` and `Retry-After` headers.

## Seeding local data

Production ingests live data from contributing marketplaces. For local development, a synthetic dataset (~6 months, ~2k rows with realistic daily variance) can be loaded with:

```bash
go run ./cmd/seed
```

The seed:

- Must be run in a development environment.
- Generates a 6-month (180-day) dataset so the dashboard at `/` has data for every period selector (7d / 30d / 3m / 6m / 1y); the 1y view is only about half populated because the data spans six months.
- Uses a synthetic Steam ID base (`76561198000000000`, plus a per-row offset) so generated IDs are clearly fake.
- Is safe to re-run: rows are de-duplicated on `(steam_id, marketplace_slug)` via `ON CONFLICT DO NOTHING`, so a rerun skips rows that already exist instead of raising a unique-constraint error.
62 changes: 62 additions & 0 deletions cmd/seed/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Command seed loads a deterministic synthetic dataset into the local
// Reverse Watch Postgres databases for local dashboard development.
// It is NEVER intended to run in production and refuses to run unless
// Config.Environment is "development".
//
// go run ./cmd/seed
//
// The insert uses ON CONFLICT (steam_id, marketplace_slug) DO NOTHING, so
// re-running is safe: rows that already exist are skipped rather than erroring.
package main

import (
"fmt"
"os"
"time"

"reverse-watch/config"
"reverse-watch/domain/models"
"reverse-watch/domain/models/constants"
"reverse-watch/internal/devseed"
"reverse-watch/logging"
"reverse-watch/repository/factory"
"reverse-watch/secret"
)

func main() {
logging.Initialize()
cfg := config.Load()

if cfg.Environment != constants.EnvironmentDevelopment {
fmt.Fprintf(os.Stderr, "refusing to seed: environment is %q (only %q is allowed)\n", cfg.Environment, constants.EnvironmentDevelopment)
os.Exit(1)
}

// Required by factory bootstrap (e.g. admin API key seeding). The
// synthetic generator pre-populates its own IDs, so the snowflake
// generator does not actually run for them.
models.InitSnowflakeGenerator(0, 0)

keygen := secret.NewKeyGenerator(cfg.Environment)
f, err := factory.NewFactory(cfg, keygen)
if err != nil {
fmt.Fprintf(os.Stderr, "failed to create factory: %v\n", err)
os.Exit(1)
}
defer func() {
if err := f.Close(); err != nil {
fmt.Fprintf(os.Stderr, "failed to close factory: %v\n", err)
}
}()

reversals := devseed.GenerateSynthetic(time.Now().UTC())
Comment thread
cursor[bot] marked this conversation as resolved.
fmt.Printf("generated %d synthetic reversals (deterministic seed)\n", len(reversals))

inserted, err := devseed.InsertReversals(f.PublicDB(), reversals)
if err != nil {
fmt.Fprintf(os.Stderr, "failed to insert reversals: %v\n", err)
os.Exit(1)
}
skipped := int64(len(reversals)) - inserted
fmt.Printf("seed complete: %d inserted, %d already present (skipped)\n", inserted, skipped)
}
198 changes: 198 additions & 0 deletions internal/devseed/synthetic.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
// Package devseed loads dev-only fixture data into the local Postgres
// instance. It is intentionally not wired into the main binary — call it
// from cmd/seed (or a test) when you need realistic data locally.
package devseed

import (
"math"
"math/rand"
"time"

"reverse-watch/domain/models"

"gorm.io/gorm"
"gorm.io/gorm/clause"
)

const (
// syntheticRNGSeed is the fixed seed for the generator's math/rand source,
// so every call to GenerateSynthetic draws the same random sequence.
syntheticRNGSeed int64 = 42
// syntheticDays is the number of consecutive days generated; the last day
// is the UTC calendar day of the `now` passed to GenerateSynthetic.
syntheticDays = 180
// syntheticTargetTotal is the row count the per-day counts are scaled
// toward. Rounding and the one-row-per-day floor make the final total
// approximate rather than exact.
syntheticTargetTotal int = 2000
// syntheticBaseSteamID is the value every generated Steam ID (and related
// Steam ID) is offset from.
syntheticBaseSteamID uint64 = 76561198000000000
// syntheticBaseReporter is the value every generated reporter internal ID
// is offset from.
syntheticBaseReporter uint = 2_900_000
)

// syntheticMarketplaces lists the marketplace slugs rows are attributed to,
// with the probability pickMarketplace assigns to each. The weights sum to 1.
var syntheticMarketplaces = []struct {
slug string
weight float64
}{
{"csfloat", 0.80},
{"tradeit", 0.10},
{"skinport", 0.05},
{"swap.gg", 0.05},
}

// GenerateSynthetic returns a ~6-month dataset (~2,000 rows, at least one
// per day, gentle sinusoid with occasional spikes / quiet days). Snowflake
// IDs are unique within the slice and won't collide with real CSV-seeded
// IDs, so callers can pipe the result straight into InsertReversals.
func GenerateSynthetic(now time.Time) []*models.Reversal {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's normalize now() to UTC before we derive any dates.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done — now() is normalized to UTC before any dates are derived.

now = now.UTC()
rng := rand.New(rand.NewSource(syntheticRNGSeed))
nowMs := uint64(now.UnixMilli())
today := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, time.UTC)

counts := make([]int, syntheticDays)
for d := 0; d < syntheticDays; d++ {
base := 40.0 + 10.0*math.Sin(float64(d)/30.0)
var mult float64
switch r := rng.Float64(); {
case r < 0.05:
mult = 2.5 + rng.Float64()*2.5
case r < 0.15:
mult = 0.2 + rng.Float64()*0.3
default:
mult = 0.7 + rng.Float64()*0.6
}
counts[d] = int(math.Max(1, base*mult+rng.NormFloat64()*5))
}

total := 0
for _, c := range counts {
total += c
}
if total > 0 {
scale := float64(syntheticTargetTotal) / float64(total)
for d := range counts {
counts[d] = int(math.Max(1, math.Round(float64(counts[d])*scale)))
}
}

rows := make([]*models.Reversal, 0, syntheticTargetTotal+200)
var steamOffset uint64 = 1
var seq uint16

for d := 0; d < syntheticDays; d++ {
dayStart := today.AddDate(0, 0, -(syntheticDays-1-d))
for i := 0; i < counts[d]; i++ {
reversedAt := dayStart.Add(time.Duration(rng.Float64() * float64(24*time.Hour)))
if uint64(reversedAt.UnixMilli()) > nowMs {
reversedAt = now.Add(-1 * time.Minute)
}
reportDelay := time.Duration(rng.Intn(8*60)+1) * time.Minute
createdAt := reversedAt.Add(reportDelay)
if uint64(createdAt.UnixMilli()) > nowMs {
createdAt = now
}

srcRoll := rng.Float64()
var src models.Source
var related *models.SteamID
switch {
case srcRoll < 0.90:
src = models.SourceDirect
case srcRoll < 0.95:
src = models.SourceRelatedUser
relID := models.SteamID(syntheticBaseSteamID + (steamOffset+10_000)*97)
related = &relID
default:
src = models.SourceUserReport
}

var expunged *uint64
if rng.Float64() < 0.015 {
eAt := createdAt.Add(time.Duration(rng.Intn(24*60)+1) * time.Minute)
if uint64(eAt.UnixMilli()) < nowMs && uint64(eAt.UnixMilli()) > uint64(createdAt.UnixMilli()) {
ems := uint64(eAt.UnixMilli())
expunged = &ems
}
}

steamID := models.SteamID(syntheticBaseSteamID + steamOffset*73 + uint64(rng.Intn(50)))
steamOffset++

// Snowflake encodes created_at + a 12-bit per-ms sequence;
// mirrors domain/models/snowflake.go so generated IDs sort
// chronologically alongside production rows.
seq = (seq + 1) & 0x0FFF
// createdAt is always well after models.Epoch for the ~6-month
// synthetic window, but clamp defensively so an out-of-range
// createdAt can't underflow the unsigned subtraction into a
// garbage snowflake (mirrors the guard in genSnowflakeWithParts).
createdAtMs := uint64(createdAt.UnixMilli())
if createdAtMs < models.Epoch {
createdAtMs = models.Epoch
}
sfTs := createdAtMs - models.Epoch
sf := models.Snowflake((sfTs << 22) | uint64(seq))
Comment thread
cursor[bot] marked this conversation as resolved.

reporter := syntheticBaseReporter + uint(steamOffset)
mp := pickMarketplace(rng)

rows = append(rows, &models.Reversal{
Model: models.Model{
ID: sf,
CreatedAt: uint64(createdAt.UnixMilli()),
UpdatedAt: uint64(createdAt.UnixMilli()),
},
SteamID: steamID,
MarketplaceSlug: mp,
Source: &src,
RelatedSteamID: related,
ReversedAt: uint64(reversedAt.UnixMilli()),
ReporterInternalID: &reporter,
ExpungedAt: expunged,
})
}
}
return rows
}

// pickMarketplace draws one slug from syntheticMarketplaces using a single
// rng.Float64() value compared against the running sum of the weights. If the
// draw is not below the final running sum, the first entry's slug is returned.
func pickMarketplace(rng *rand.Rand) string {
	draw := rng.Float64()
	var upper float64
	for i := range syntheticMarketplaces {
		upper += syntheticMarketplaces[i].weight
		if draw < upper {
			return syntheticMarketplaces[i].slug
		}
	}
	return syntheticMarketplaces[0].slug
}

// insertChunkSize is the maximum number of rows InsertReversals passes to a
// single Create call. Chunking keeps each bulk insert under Postgres's 65,535
// parameter-per-statement cap: at ~11 columns per Reversal, 1,000 rows
// uses ~11k parameters.
// NOTE(review): the ~11-column figure is an estimate; confirm against
// models.Reversal if columns are added.
const insertChunkSize = 1000

// InsertReversals writes reversals to db in batches of at most
// insertChunkSize rows. Every batch is created with
// ON CONFLICT (steam_id, marketplace_slug) WHERE deleted_at IS NULL DO NOTHING,
// so a row whose pair is already present is skipped rather than failing the
// statement; this is what makes re-running the seed safe. The conflict target
// is meant to match the partial unique index created in repository/public
// (idx_reversals_steam_id_marketplace_slug ... WHERE deleted_at IS NULL).
//
// It returns the number of rows actually inserted. On the first failing batch
// it stops and returns that error together with the count inserted by the
// batches that had already succeeded.
func InsertReversals(db *gorm.DB, reversals []*models.Reversal) (int64, error) {
	if len(reversals) == 0 {
		return 0, nil
	}

	skipExisting := clause.OnConflict{
		Columns: []clause.Column{
			{Name: "steam_id"},
			{Name: "marketplace_slug"},
		},
		TargetWhere: clause.Where{
			Exprs: []clause.Expression{clause.Expr{SQL: "deleted_at IS NULL"}},
		},
		DoNothing: true,
	}

	var total int64
	// Consume the input from the front, one batch at a time.
	for rest := reversals; len(rest) > 0; {
		n := insertChunkSize
		if len(rest) < n {
			n = len(rest)
		}
		batch := rest[:n]
		rest = rest[n:]

		result := db.Clauses(skipExisting).Create(batch)
		if result.Error != nil {
			return total, result.Error
		}
		total += result.RowsAffected
	}
	return total, nil
}
65 changes: 65 additions & 0 deletions internal/devseed/synthetic_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package devseed

import (
"testing"
"time"

"reverse-watch/domain/models"
"reverse-watch/internal/testutil"
"reverse-watch/repository/public"
)

// TestInsertReversals_RerunIsSafe guards Finding 1: re-running the seed on a
// different calendar day produces new snowflake IDs but the same deterministic
// (steam_id, marketplace_slug) pairs, which collide with the partial unique
// index. The insert must skip those rows via ON CONFLICT DO NOTHING rather
// than raising a unique-constraint error.
//
// The test also checks that the first run really wrote rows; otherwise the
// "rerun inserted 0 rows" assertion would pass vacuously.
func TestInsertReversals_RerunIsSafe(t *testing.T) {
	db := testutil.NewTestDB(t)
	if err := public.CreateIndexes(db); err != nil {
		t.Fatalf("CreateIndexes: %v", err)
	}

	// Both runs use past timestamps (BeforeCreate rejects future
	// reversed_at), but on different calendar days so snowflake IDs differ.
	now := time.Now().UTC().AddDate(0, 0, -10)
	first := GenerateSynthetic(now)
	if len(first) == 0 {
		t.Fatal("first run generated no rows")
	}
	n1, err := InsertReversals(db, first)
	if err != nil {
		t.Fatalf("first insert: %v", err)
	}
	if n1 == 0 {
		t.Fatalf("first insert wrote 0 of %d rows; the rerun check would pass vacuously", len(first))
	}

	later := now.AddDate(0, 0, 5)
	second := GenerateSynthetic(later)
	if len(second) == 0 {
		t.Fatal("second run generated no rows")
	}
	n2, err := InsertReversals(db, second)
	if err != nil {
		t.Fatalf("rerun errored, expected ON CONFLICT DO NOTHING to skip: %v", err)
	}
	if n2 != 0 {
		t.Errorf("rerun inserted %d rows, want 0 (idempotent)", n2)
	}

	if first[0].ID == second[0].ID {
		t.Errorf("expected differing snowflake IDs across days, got %d for both", first[0].ID)
	}
}

// TestGenerateSynthetic_NoEpochUnderflow guards Finding 2: when the supplied
// clock predates models.Epoch, the unsigned "createdAt - Epoch" subtraction
// used to build each snowflake must be clamped instead of wrapping around.
//
// With a clock one day before Epoch every generated createdAt precedes Epoch,
// so every snowflake must decode to exactly Epoch. Without the clamp the
// subtraction wraps to a huge value and decodes to a timestamp far beyond it.
func TestGenerateSynthetic_NoEpochUnderflow(t *testing.T) {
	epoch := time.UnixMilli(int64(models.Epoch)).UTC()
	rows := GenerateSynthetic(epoch.AddDate(0, 0, -1))
	if len(rows) == 0 {
		t.Fatal("expected synthetic rows")
	}
	for i := range rows {
		if got := models.ParseSnowflake(rows[i].ID).Timestamp; got != models.Epoch {
			t.Fatalf("snowflake timestamp %d != Epoch %d (underflow not guarded)", got, models.Epoch)
		}
	}
}
Loading