-
Notifications
You must be signed in to change notification settings - Fork 4
cmd/seed: synthetic dataset for local development #70
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
ZukwiZ
wants to merge
3
commits into
master
Choose a base branch
from
feat/dev-seed-synthetic
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+344
−1
Open
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,3 +12,6 @@ static/*.db-wal | |
|
|
||
| .secrets | ||
| .env | ||
|
|
||
| internal-docs/ | ||
| internal/devseed/fixtures/ | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,62 @@ | ||
| // Command seed loads a deterministic synthetic dataset into the local | ||
| // Reverse Watch Postgres databases for local dashboard development. | ||
| // It is NEVER intended to run in production and refuses to run unless | ||
| // Config.Environment is "development". | ||
| // | ||
| // go run ./cmd/seed | ||
| // | ||
| // The insert uses ON CONFLICT (steam_id, marketplace_slug) DO NOTHING, so | ||
| // re-running is safe: rows that already exist are skipped rather than erroring. | ||
| package main | ||
|
|
||
| import ( | ||
| "fmt" | ||
| "os" | ||
| "time" | ||
|
|
||
| "reverse-watch/config" | ||
| "reverse-watch/domain/models" | ||
| "reverse-watch/domain/models/constants" | ||
| "reverse-watch/internal/devseed" | ||
| "reverse-watch/logging" | ||
| "reverse-watch/repository/factory" | ||
| "reverse-watch/secret" | ||
| ) | ||
|
|
||
| func main() { | ||
| logging.Initialize() | ||
| cfg := config.Load() | ||
|
|
||
| if cfg.Environment != constants.EnvironmentDevelopment { | ||
| fmt.Fprintf(os.Stderr, "refusing to seed: environment is %q (only %q is allowed)\n", cfg.Environment, constants.EnvironmentDevelopment) | ||
| os.Exit(1) | ||
| } | ||
|
|
||
| // Required by factory bootstrap (e.g. admin API key seeding). The | ||
| // synthetic generator pre-populates its own IDs, so the snowflake | ||
| // generator does not actually run for them. | ||
| models.InitSnowflakeGenerator(0, 0) | ||
|
|
||
| keygen := secret.NewKeyGenerator(cfg.Environment) | ||
| f, err := factory.NewFactory(cfg, keygen) | ||
| if err != nil { | ||
| fmt.Fprintf(os.Stderr, "failed to create factory: %v\n", err) | ||
| os.Exit(1) | ||
| } | ||
| defer func() { | ||
| if err := f.Close(); err != nil { | ||
| fmt.Fprintf(os.Stderr, "failed to close factory: %v\n", err) | ||
| } | ||
| }() | ||
|
|
||
| reversals := devseed.GenerateSynthetic(time.Now().UTC()) | ||
| fmt.Printf("generated %d synthetic reversals (deterministic seed)\n", len(reversals)) | ||
|
|
||
| inserted, err := devseed.InsertReversals(f.PublicDB(), reversals) | ||
| if err != nil { | ||
| fmt.Fprintf(os.Stderr, "failed to insert reversals: %v\n", err) | ||
| os.Exit(1) | ||
| } | ||
| skipped := int64(len(reversals)) - inserted | ||
| fmt.Printf("seed complete: %d inserted, %d already present (skipped)\n", inserted, skipped) | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,198 @@ | ||
| // Package devseed loads dev-only fixture data into the local Postgres | ||
| // instance. It is intentionally not wired into the main binary — call it | ||
| // from cmd/seed (or a test) when you need realistic data locally. | ||
| package devseed | ||
|
|
||
| import ( | ||
| "math" | ||
| "math/rand" | ||
| "time" | ||
|
|
||
| "reverse-watch/domain/models" | ||
|
|
||
| "gorm.io/gorm" | ||
| "gorm.io/gorm/clause" | ||
| ) | ||
|
|
||
// Parameters of the synthetic dataset built by GenerateSynthetic.
const (
	// syntheticRNGSeed seeds the pseudo-random source, so every run draws
	// the same sequence of random values.
	syntheticRNGSeed int64 = 42

	// syntheticDays is how many consecutive days the dataset covers; the
	// last one is the day of the clock passed to GenerateSynthetic.
	syntheticDays = 180

	// syntheticTargetTotal is the row count the per-day counts are scaled
	// towards.
	syntheticTargetTotal int = 2000

	// syntheticBaseSteamID is the value every generated Steam ID is offset
	// from.
	syntheticBaseSteamID uint64 = 76561198000000000

	// syntheticBaseReporter is the value every generated reporter internal
	// ID is offset from.
	syntheticBaseReporter uint = 2_900_000
)
|
|
||
// syntheticMarketplaces lists the marketplace slugs assigned to synthetic
// rows, each paired with the weight pickMarketplace uses when choosing
// between them.
var syntheticMarketplaces = []struct {
	slug   string
	weight float64
}{
	{slug: "csfloat", weight: 0.80},
	{slug: "tradeit", weight: 0.10},
	{slug: "skinport", weight: 0.05},
	{slug: "swap.gg", weight: 0.05},
}
|
|
||
| // GenerateSynthetic returns a ~6-month dataset (~2,000 rows, at least one | ||
| // per day, gentle sinusoid with occasional spikes / quiet days). Snowflake | ||
| // IDs are unique within the slice and won't collide with real CSV-seeded | ||
| // IDs, so callers can pipe the result straight into InsertReversals. | ||
| func GenerateSynthetic(now time.Time) []*models.Reversal { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's normalize
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done — |
||
| now = now.UTC() | ||
| rng := rand.New(rand.NewSource(syntheticRNGSeed)) | ||
| nowMs := uint64(now.UnixMilli()) | ||
| today := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, time.UTC) | ||
|
|
||
| counts := make([]int, syntheticDays) | ||
| for d := 0; d < syntheticDays; d++ { | ||
| base := 40.0 + 10.0*math.Sin(float64(d)/30.0) | ||
| var mult float64 | ||
| switch r := rng.Float64(); { | ||
| case r < 0.05: | ||
| mult = 2.5 + rng.Float64()*2.5 | ||
| case r < 0.15: | ||
| mult = 0.2 + rng.Float64()*0.3 | ||
| default: | ||
| mult = 0.7 + rng.Float64()*0.6 | ||
| } | ||
| counts[d] = int(math.Max(1, base*mult+rng.NormFloat64()*5)) | ||
| } | ||
|
|
||
| total := 0 | ||
| for _, c := range counts { | ||
| total += c | ||
| } | ||
| if total > 0 { | ||
| scale := float64(syntheticTargetTotal) / float64(total) | ||
| for d := range counts { | ||
| counts[d] = int(math.Max(1, math.Round(float64(counts[d])*scale))) | ||
| } | ||
| } | ||
|
|
||
| rows := make([]*models.Reversal, 0, syntheticTargetTotal+200) | ||
| var steamOffset uint64 = 1 | ||
| var seq uint16 | ||
|
|
||
| for d := 0; d < syntheticDays; d++ { | ||
| dayStart := today.AddDate(0, 0, -(syntheticDays-1-d)) | ||
| for i := 0; i < counts[d]; i++ { | ||
| reversedAt := dayStart.Add(time.Duration(rng.Float64() * float64(24*time.Hour))) | ||
| if uint64(reversedAt.UnixMilli()) > nowMs { | ||
| reversedAt = now.Add(-1 * time.Minute) | ||
| } | ||
| reportDelay := time.Duration(rng.Intn(8*60)+1) * time.Minute | ||
| createdAt := reversedAt.Add(reportDelay) | ||
| if uint64(createdAt.UnixMilli()) > nowMs { | ||
| createdAt = now | ||
| } | ||
|
|
||
| srcRoll := rng.Float64() | ||
| var src models.Source | ||
| var related *models.SteamID | ||
| switch { | ||
| case srcRoll < 0.90: | ||
| src = models.SourceDirect | ||
| case srcRoll < 0.95: | ||
| src = models.SourceRelatedUser | ||
| relID := models.SteamID(syntheticBaseSteamID + (steamOffset+10_000)*97) | ||
| related = &relID | ||
| default: | ||
| src = models.SourceUserReport | ||
| } | ||
|
|
||
| var expunged *uint64 | ||
| if rng.Float64() < 0.015 { | ||
| eAt := createdAt.Add(time.Duration(rng.Intn(24*60)+1) * time.Minute) | ||
| if uint64(eAt.UnixMilli()) < nowMs && uint64(eAt.UnixMilli()) > uint64(createdAt.UnixMilli()) { | ||
| ems := uint64(eAt.UnixMilli()) | ||
| expunged = &ems | ||
| } | ||
| } | ||
|
|
||
| steamID := models.SteamID(syntheticBaseSteamID + steamOffset*73 + uint64(rng.Intn(50))) | ||
| steamOffset++ | ||
|
|
||
| // Snowflake encodes created_at + a 12-bit per-ms sequence; | ||
| // mirrors domain/models/snowflake.go so generated IDs sort | ||
| // chronologically alongside production rows. | ||
| seq = (seq + 1) & 0x0FFF | ||
| // createdAt is always well after models.Epoch for the ~6-month | ||
| // synthetic window, but clamp defensively so an out-of-range | ||
| // createdAt can't underflow the unsigned subtraction into a | ||
| // garbage snowflake (mirrors the guard in genSnowflakeWithParts). | ||
| createdAtMs := uint64(createdAt.UnixMilli()) | ||
| if createdAtMs < models.Epoch { | ||
| createdAtMs = models.Epoch | ||
| } | ||
| sfTs := createdAtMs - models.Epoch | ||
| sf := models.Snowflake((sfTs << 22) | uint64(seq)) | ||
|
cursor[bot] marked this conversation as resolved.
|
||
|
|
||
| reporter := syntheticBaseReporter + uint(steamOffset) | ||
| mp := pickMarketplace(rng) | ||
|
|
||
| rows = append(rows, &models.Reversal{ | ||
| Model: models.Model{ | ||
| ID: sf, | ||
| CreatedAt: uint64(createdAt.UnixMilli()), | ||
| UpdatedAt: uint64(createdAt.UnixMilli()), | ||
| }, | ||
| SteamID: steamID, | ||
| MarketplaceSlug: mp, | ||
| Source: &src, | ||
| RelatedSteamID: related, | ||
| ReversedAt: uint64(reversedAt.UnixMilli()), | ||
| ReporterInternalID: &reporter, | ||
| ExpungedAt: expunged, | ||
| }) | ||
| } | ||
| } | ||
| return rows | ||
| } | ||
|
|
||
| func pickMarketplace(rng *rand.Rand) string { | ||
| r := rng.Float64() | ||
| cum := 0.0 | ||
| for _, mp := range syntheticMarketplaces { | ||
| cum += mp.weight | ||
| if r < cum { | ||
| return mp.slug | ||
| } | ||
| } | ||
| return syntheticMarketplaces[0].slug | ||
| } | ||
|
|
||
| // insertChunkSize keeps each bulk insert under Postgres's 65,535 | ||
| // parameter-per-statement cap. At ~11 columns per Reversal, 1,000 rows | ||
| // uses ~11k parameters. | ||
| const insertChunkSize = 1000 | ||
|
|
||
| // InsertReversals bulk-inserts reversals, skipping any row whose | ||
| // (steam_id, marketplace_slug) already exists via ON CONFLICT DO NOTHING. | ||
| // That pair is deterministic across runs (it does not depend on wall-clock | ||
| // time), so re-running the seed is safe and idempotent: already-present rows | ||
| // are skipped instead of raising a unique-constraint error. The conflict | ||
| // target matches the partial unique index created in repository/public | ||
| // (idx_reversals_steam_id_marketplace_slug ... WHERE deleted_at IS NULL). | ||
| // Returns the number of rows actually inserted. | ||
| func InsertReversals(db *gorm.DB, reversals []*models.Reversal) (int64, error) { | ||
| if len(reversals) == 0 { | ||
| return 0, nil | ||
| } | ||
| onConflict := clause.OnConflict{ | ||
| Columns: []clause.Column{{Name: "steam_id"}, {Name: "marketplace_slug"}}, | ||
| TargetWhere: clause.Where{Exprs: []clause.Expression{clause.Expr{SQL: "deleted_at IS NULL"}}}, | ||
| DoNothing: true, | ||
| } | ||
| var inserted int64 | ||
| for i := 0; i < len(reversals); i += insertChunkSize { | ||
| end := i + insertChunkSize | ||
| if end > len(reversals) { | ||
| end = len(reversals) | ||
| } | ||
| res := db.Clauses(onConflict).Create(reversals[i:end]) | ||
| if res.Error != nil { | ||
| return inserted, res.Error | ||
| } | ||
| inserted += res.RowsAffected | ||
| } | ||
| return inserted, nil | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,65 @@ | ||
| package devseed | ||
|
|
||
| import ( | ||
| "testing" | ||
| "time" | ||
|
|
||
| "reverse-watch/domain/models" | ||
| "reverse-watch/internal/testutil" | ||
| "reverse-watch/repository/public" | ||
| ) | ||
|
|
||
| // TestInsertReversals_RerunIsSafe guards Finding 1: re-running the seed on a | ||
| // different calendar day produces new snowflake IDs but the same deterministic | ||
| // (steam_id, marketplace_slug) pairs, which collide with the partial unique | ||
| // index. The insert must skip those rows via ON CONFLICT DO NOTHING rather | ||
| // than raising a unique-constraint error. | ||
| func TestInsertReversals_RerunIsSafe(t *testing.T) { | ||
| db := testutil.NewTestDB(t) | ||
| if err := public.CreateIndexes(db); err != nil { | ||
| t.Fatalf("CreateIndexes: %v", err) | ||
| } | ||
|
|
||
| // Both runs use past timestamps (BeforeCreate rejects future | ||
| // reversed_at), but on different calendar days so snowflake IDs differ. | ||
| now := time.Now().UTC().AddDate(0, 0, -10) | ||
| first := GenerateSynthetic(now) | ||
| if _, err := InsertReversals(db, first); err != nil { | ||
| t.Fatalf("first insert: %v", err) | ||
| } | ||
|
|
||
| later := now.AddDate(0, 0, 5) | ||
| second := GenerateSynthetic(later) | ||
| n2, err := InsertReversals(db, second) | ||
| if err != nil { | ||
| t.Fatalf("rerun errored, expected ON CONFLICT DO NOTHING to skip: %v", err) | ||
| } | ||
| if n2 != 0 { | ||
| t.Errorf("rerun inserted %d rows, want 0 (idempotent)", n2) | ||
| } | ||
|
|
||
| if first[0].ID == second[0].ID { | ||
| t.Errorf("expected differing snowflake IDs across days, got %d for both", first[0].ID) | ||
| } | ||
| } | ||
|
|
||
| // TestGenerateSynthetic_NoEpochUnderflow guards Finding 2: generating data | ||
| // with a clock that predates models.Epoch must not underflow the unsigned | ||
| // snowflake timestamp subtraction. Every decoded timestamp must be >= Epoch. | ||
| func TestGenerateSynthetic_NoEpochUnderflow(t *testing.T) { | ||
| // A clock one day before Epoch makes every generated createdAt precede | ||
| // Epoch, so the guard must clamp each snowflake timestamp to exactly | ||
| // Epoch. Without the guard the unsigned subtraction wraps to a huge | ||
| // value, decoding to a timestamp far beyond Epoch. | ||
| beforeEpoch := time.UnixMilli(int64(models.Epoch)).UTC().AddDate(0, 0, -1) | ||
| rows := GenerateSynthetic(beforeEpoch) | ||
| if len(rows) == 0 { | ||
| t.Fatal("expected synthetic rows") | ||
| } | ||
| for _, r := range rows { | ||
| ts := models.ParseSnowflake(r.ID).Timestamp | ||
| if ts != models.Epoch { | ||
| t.Fatalf("snowflake timestamp %d != Epoch %d (underflow not guarded)", ts, models.Epoch) | ||
| } | ||
| } | ||
| } |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.