apollo-backend/internal/cmd/scheduler.go

544 lines
14 KiB
Go
Raw Normal View History

package cmd
2021-07-08 23:03:46 +00:00
import (
"context"
"fmt"
2023-03-16 16:01:04 +00:00
"math"
2022-05-27 17:27:19 +00:00
"net/http"
_ "net/http/pprof"
"strconv"
2022-06-04 14:21:29 +00:00
"sync"
2021-07-08 23:03:46 +00:00
"time"
2021-07-09 02:09:14 +00:00
"github.com/DataDog/datadog-go/statsd"
2022-11-01 23:02:25 +00:00
"github.com/adjust/rmq/v5"
2021-07-08 23:03:46 +00:00
"github.com/go-co-op/gocron"
"github.com/go-redis/redis/v8"
2023-03-24 17:12:09 +00:00
"github.com/jackc/pgx/v5/pgxpool"
"github.com/spf13/cobra"
2022-05-23 18:17:25 +00:00
"go.uber.org/zap"
2021-08-14 16:08:17 +00:00
2022-03-28 21:05:01 +00:00
"github.com/christianselig/apollo-backend/internal/cmdutil"
"github.com/christianselig/apollo-backend/internal/domain"
"github.com/christianselig/apollo-backend/internal/repository"
2021-07-09 03:12:50 +00:00
)
const (
	// batchSize is a batch-size constant.
	// NOTE(review): nothing in the visible part of this file references it —
	// confirm whether it is still needed.
	batchSize = 250

	// accountEnqueueSeconds is the number of chunks enqueueAccounts splits the
	// account list into; chunk i is processed after a delay of i seconds.
	accountEnqueueSeconds = 60
)
2022-03-28 21:05:01 +00:00
func SchedulerCmd(ctx context.Context) *cobra.Command {
cmd := &cobra.Command{
Use: "scheduler",
Args: cobra.ExactArgs(0),
Short: "Schedules jobs and runs several maintenance tasks periodically.",
RunE: func(cmd *cobra.Command, args []string) error {
2022-05-23 18:29:15 +00:00
logger := cmdutil.NewLogger("scheduler")
2022-05-23 18:17:25 +00:00
defer func() { _ = logger.Sync() }()
statsd, err := cmdutil.NewStatsdClient()
if err != nil {
return fmt.Errorf("could not initialize statsd: %w", err)
}
defer statsd.Close()
2021-07-20 17:00:53 +00:00
db, err := cmdutil.NewDatabasePool(ctx, 1)
if err != nil {
return fmt.Errorf("could not connect to database: %w", err)
}
defer db.Close()
2022-11-06 00:01:35 +00:00
redis, err := cmdutil.NewRedisLocksClient(ctx, 64)
if err != nil {
return fmt.Errorf("could not connect to redis locks: %w", err)
}
defer redis.Close()
qredis, err := cmdutil.NewRedisQueueClient(ctx, 16)
if err != nil {
return fmt.Errorf("could not connect to redis queues: %w", err)
}
defer qredis.Close()
queue, err := cmdutil.NewQueueClient(logger, qredis, "worker")
if err != nil {
return err
}
// Eval lua so that we don't keep parsing it
luaSha, err := evalScript(ctx, redis)
if err != nil {
return err
}
notifQueue, err := queue.OpenQueue("notifications")
if err != nil {
return err
}
2021-09-25 16:56:01 +00:00
subredditQueue, err := queue.OpenQueue("subreddits")
if err != nil {
return err
}
2021-10-10 15:51:42 +00:00
trendingQueue, err := queue.OpenQueue("trending")
if err != nil {
return err
}
2021-10-09 14:59:20 +00:00
userQueue, err := queue.OpenQueue("users")
if err != nil {
return err
}
2021-10-17 14:17:41 +00:00
stuckNotificationsQueue, err := queue.OpenQueue("stuck-notifications")
if err != nil {
return err
}
2022-10-19 13:37:41 +00:00
liveActivitiesQueue, err := queue.OpenQueue("live-activities")
if err != nil {
return err
}
s := gocron.NewScheduler(time.UTC)
2022-10-27 14:09:40 +00:00
s.SetMaxConcurrentJobs(8, gocron.WaitMode)
eaj, _ := s.Every(5).Seconds().Do(func() { enqueueAccounts(ctx, logger, statsd, db, redis, luaSha, notifQueue) })
2022-11-01 17:14:33 +00:00
eaj.SingletonMode()
_, _ = s.Every(5).Seconds().Do(func() { enqueueSubreddits(ctx, logger, statsd, db, []rmq.Queue{subredditQueue, trendingQueue}) })
_, _ = s.Every(5).Seconds().Do(func() { enqueueUsers(ctx, logger, statsd, db, userQueue) })
_, _ = s.Every(5).Seconds().Do(func() { enqueueLiveActivities(ctx, logger, db, redis, luaSha, liveActivitiesQueue) })
_, _ = s.Every(5).Seconds().Do(func() { cleanQueues(logger, queue) })
_, _ = s.Every(5).Seconds().Do(func() { enqueueStuckAccounts(ctx, logger, statsd, db, stuckNotificationsQueue) })
_, _ = s.Every(1).Minute().Do(func() { reportStats(ctx, logger, statsd, db) })
2022-07-31 19:07:14 +00:00
//_, _ = s.Every(1).Minute().Do(func() { pruneAccounts(ctx, logger, db) })
//_, _ = s.Every(1).Minute().Do(func() { pruneDevices(ctx, logger, db) })
s.StartAsync()
2022-05-27 17:27:19 +00:00
srv := &http.Server{Addr: ":8080"}
go func() { _ = srv.ListenAndServe() }()
<-ctx.Done()
s.Stop()
return nil
},
2021-07-08 23:03:46 +00:00
}
return cmd
2021-07-08 23:03:46 +00:00
}
func evalScript(ctx context.Context, redis *redis.Client) (string, error) {
2022-10-27 00:48:22 +00:00
ctx, cancel := context.WithCancel(ctx)
defer cancel()
lua := fmt.Sprintf(`
local retv={}
for i=1, #ARGV do
local key = KEYS[1] .. ":" .. ARGV[i]
if redis.call("set", key, 1, "nx", "ex", %.0f) then
retv[#retv + 1] = ARGV[i]
end
end
return retv
2022-05-22 23:57:29 +00:00
`, domain.NotificationCheckTimeout.Seconds())
return redis.ScriptLoad(ctx, lua).Result()
}
2022-10-19 13:37:41 +00:00
func enqueueLiveActivities(ctx context.Context, logger *zap.Logger, pool *pgxpool.Pool, redisConn *redis.Client, luaSha string, queue rmq.Queue) {
2022-10-27 00:48:22 +00:00
ctx, cancel := context.WithCancel(ctx)
defer cancel()
2022-10-27 19:47:26 +00:00
now := time.Now()
2022-10-19 13:37:41 +00:00
next := now.Add(domain.LiveActivityCheckInterval)
stmt := `UPDATE live_activities
SET next_check_at = $2
WHERE id IN (
SELECT id
FROM live_activities
WHERE next_check_at < $1
ORDER BY next_check_at
FOR UPDATE SKIP LOCKED
2022-10-27 02:55:08 +00:00
LIMIT 1000
2022-10-19 13:37:41 +00:00
)
RETURNING live_activities.apns_token`
ats := []string{}
rows, err := pool.Query(ctx, stmt, now, next)
if err != nil {
logger.Error("failed to fetch batch of live activities", zap.Error(err))
return
}
for rows.Next() {
var at string
_ = rows.Scan(&at)
ats = append(ats, at)
}
rows.Close()
if len(ats) == 0 {
return
}
batch, err := redisConn.EvalSha(ctx, luaSha, []string{"locks:live-activities"}, ats).StringSlice()
if err != nil {
logger.Error("failed to lock live activities", zap.Error(err))
return
}
if len(batch) == 0 {
return
}
logger.Debug("enqueueing live activity batch", zap.Int("count", len(batch)), zap.Time("start", now))
if err = queue.Publish(batch...); err != nil {
logger.Error("failed to enqueue live activity batch", zap.Error(err))
}
}
2022-05-23 18:17:25 +00:00
func pruneAccounts(ctx context.Context, logger *zap.Logger, pool *pgxpool.Pool) {
2022-10-27 00:48:22 +00:00
ctx, cancel := context.WithCancel(ctx)
defer cancel()
2022-03-28 21:05:01 +00:00
expiry := time.Now().Add(-domain.StaleTokenThreshold)
2021-08-14 15:54:48 +00:00
ar := repository.NewPostgresAccount(pool)
2022-03-28 21:05:01 +00:00
stale, err := ar.PruneStale(ctx, expiry)
2021-08-14 15:54:48 +00:00
if err != nil {
2022-05-23 18:17:25 +00:00
logger.Error("failed to clean stale accounts", zap.Error(err))
2021-08-14 15:54:48 +00:00
return
}
2021-08-14 15:59:13 +00:00
orphaned, err := ar.PruneOrphaned(ctx)
2021-07-12 19:36:22 +00:00
if err != nil {
2022-05-23 18:17:25 +00:00
logger.Error("failed to clean orphaned accounts", zap.Error(err))
2021-07-12 19:36:22 +00:00
return
}
if count := stale + orphaned; count > 0 {
2022-05-23 18:17:25 +00:00
logger.Info("pruned accounts", zap.Int64("stale", stale), zap.Int64("orphaned", orphaned))
2021-07-23 00:22:46 +00:00
}
2021-07-12 19:36:22 +00:00
}
2022-05-23 18:17:25 +00:00
func pruneDevices(ctx context.Context, logger *zap.Logger, pool *pgxpool.Pool) {
2022-10-27 00:48:22 +00:00
ctx, cancel := context.WithCancel(ctx)
defer cancel()
2022-03-28 21:05:01 +00:00
now := time.Now()
2021-08-14 16:08:17 +00:00
dr := repository.NewPostgresDevice(pool)
2022-03-28 21:05:01 +00:00
count, err := dr.PruneStale(ctx, now)
2021-08-14 16:08:17 +00:00
if err != nil {
2022-05-23 18:17:25 +00:00
logger.Error("failed to clean stale devices", zap.Error(err))
2021-08-14 16:08:17 +00:00
return
}
if count > 0 {
2022-05-23 18:17:25 +00:00
logger.Info("pruned devices", zap.Int64("count", count))
2021-08-14 16:08:17 +00:00
}
}
2022-05-23 18:17:25 +00:00
func cleanQueues(logger *zap.Logger, jobsConn rmq.Connection) {
2021-07-12 19:36:22 +00:00
cleaner := rmq.NewCleaner(jobsConn)
count, err := cleaner.Clean()
if err != nil {
2022-05-23 18:17:25 +00:00
logger.Error("failed to clean jobs from queues", zap.Error(err))
2021-07-12 19:36:22 +00:00
return
}
2021-10-17 14:17:41 +00:00
if count > 0 {
2022-05-23 18:17:25 +00:00
logger.Info("returned jobs to queues", zap.Int64("count", count))
2021-10-17 14:17:41 +00:00
}
2021-07-12 19:36:22 +00:00
}
2022-05-23 18:17:25 +00:00
func reportStats(ctx context.Context, logger *zap.Logger, statsd *statsd.Client, pool *pgxpool.Pool) {
2022-10-27 00:48:22 +00:00
ctx, cancel := context.WithCancel(ctx)
defer cancel()
var (
count int64
metrics = []struct {
query string
name string
}{
{"SELECT COUNT(*) FROM accounts", "apollo.registrations.accounts"},
{"SELECT COUNT(*) FROM devices", "apollo.registrations.devices"},
2021-10-17 14:17:41 +00:00
{"SELECT COUNT(*) FROM subreddits", "apollo.registrations.subreddits"},
{"SELECT COUNT(*) FROM users", "apollo.registrations.users"},
2022-10-27 00:36:03 +00:00
{"SELECT COUNT(*) FROM live_activities", "apollo.registrations.live-activities"},
}
)
for _, metric := range metrics {
2021-09-25 13:19:42 +00:00
_ = pool.QueryRow(ctx, metric.query).Scan(&count)
_ = statsd.Gauge(metric.name, float64(count), []string{}, 1)
2022-05-23 18:17:25 +00:00
logger.Debug("fetched metrics", zap.String("metric", metric.name), zap.Int64("count", count))
}
}
2022-05-23 18:17:25 +00:00
func enqueueUsers(ctx context.Context, logger *zap.Logger, statsd *statsd.Client, pool *pgxpool.Pool, queue rmq.Queue) {
2022-10-27 00:48:22 +00:00
ctx, cancel := context.WithCancel(ctx)
defer cancel()
2021-10-09 14:59:20 +00:00
now := time.Now()
2022-03-28 21:05:01 +00:00
next := now.Add(domain.NotificationCheckInterval)
ids := []int64{}
2021-10-09 14:59:20 +00:00
2021-10-17 16:04:09 +00:00
defer func() {
tags := []string{"queue:users"}
_ = statsd.Histogram("apollo.queue.enqueued", float64(len(ids)), tags, 1)
_ = statsd.Histogram("apollo.queue.runtime", float64(time.Since(now).Milliseconds()), tags, 1)
}()
2022-07-13 22:43:27 +00:00
stmt := `
UPDATE users
SET next_check_at = $2
WHERE id IN (
SELECT id
FROM users
WHERE next_check_at < $1
ORDER BY next_check_at
FOR UPDATE SKIP LOCKED
LIMIT 100
)
RETURNING users.id`
rows, err := pool.Query(ctx, stmt, now, next)
2021-10-09 14:59:20 +00:00
if err != nil {
2022-05-23 18:17:25 +00:00
logger.Error("failed to fetch batch of users", zap.Error(err))
2021-10-09 14:59:20 +00:00
return
}
2022-07-13 22:43:27 +00:00
for rows.Next() {
var id int64
_ = rows.Scan(&id)
ids = append(ids, id)
}
rows.Close()
2021-10-09 14:59:20 +00:00
if len(ids) == 0 {
return
}
2022-05-23 18:17:25 +00:00
logger.Debug("enqueueing user batch", zap.Int("count", len(ids)), zap.Time("start", now))
2021-10-09 14:59:20 +00:00
batchIds := make([]string, len(ids))
for i, id := range ids {
batchIds[i] = strconv.FormatInt(id, 10)
}
if err = queue.Publish(batchIds...); err != nil {
2022-05-23 18:17:25 +00:00
logger.Error("failed to enqueue user batch", zap.Error(err))
2021-10-09 14:59:20 +00:00
}
}
2022-05-23 18:17:25 +00:00
func enqueueSubreddits(ctx context.Context, logger *zap.Logger, statsd *statsd.Client, pool *pgxpool.Pool, queues []rmq.Queue) {
2022-10-27 00:48:22 +00:00
ctx, cancel := context.WithCancel(ctx)
defer cancel()
2021-09-25 16:56:01 +00:00
now := time.Now()
2022-03-28 21:05:01 +00:00
next := now.Add(domain.SubredditCheckInterval)
2021-09-25 16:56:01 +00:00
ids := []int64{}
2021-10-17 16:04:09 +00:00
defer func() {
tags := []string{"queue:subreddits"}
_ = statsd.Histogram("apollo.queue.enqueued", float64(len(ids)), tags, 1)
_ = statsd.Histogram("apollo.queue.runtime", float64(time.Since(now).Milliseconds()), tags, 1)
}()
2022-07-13 22:43:27 +00:00
stmt := `
2022-05-21 21:15:24 +00:00
UPDATE subreddits
SET next_check_at = $2
WHERE subreddits.id IN(
SELECT id
2021-09-25 16:56:01 +00:00
FROM subreddits
2022-03-28 21:05:01 +00:00
WHERE next_check_at < $1
ORDER BY next_check_at
2022-05-21 21:15:24 +00:00
FOR UPDATE SKIP LOCKED
2021-09-25 16:56:01 +00:00
LIMIT 100
)
RETURNING subreddits.id`
2022-07-13 22:43:27 +00:00
rows, err := pool.Query(ctx, stmt, now, next)
2021-09-25 16:56:01 +00:00
if err != nil {
2022-05-23 18:17:25 +00:00
logger.Error("failed to fetch batch of subreddits", zap.Error(err))
2021-09-25 16:56:01 +00:00
return
}
2022-07-13 22:43:27 +00:00
for rows.Next() {
var id int64
_ = rows.Scan(&id)
ids = append(ids, id)
}
rows.Close()
2021-09-25 16:56:01 +00:00
if len(ids) == 0 {
return
}
2022-05-23 18:17:25 +00:00
logger.Debug("enqueueing subreddit batch", zap.Int("count", len(ids)), zap.Time("start", now))
2021-09-25 16:56:01 +00:00
batchIds := make([]string, len(ids))
for i, id := range ids {
batchIds[i] = strconv.FormatInt(id, 10)
}
2021-10-10 15:51:42 +00:00
for _, queue := range queues {
if err = queue.Publish(batchIds...); err != nil {
2022-05-23 18:17:25 +00:00
logger.Error("failed to enqueue subreddit batch", zap.Error(err))
2021-10-10 15:51:42 +00:00
}
2021-09-25 16:56:01 +00:00
}
}
2022-05-23 18:17:25 +00:00
func enqueueStuckAccounts(ctx context.Context, logger *zap.Logger, statsd *statsd.Client, pool *pgxpool.Pool, queue rmq.Queue) {
2022-10-27 00:48:22 +00:00
ctx, cancel := context.WithCancel(ctx)
defer cancel()
2021-10-17 14:17:41 +00:00
now := time.Now()
2022-03-28 21:05:01 +00:00
next := now.Add(domain.StuckNotificationCheckInterval)
2021-10-17 14:17:41 +00:00
ids := []int64{}
2021-10-17 16:04:09 +00:00
defer func() {
tags := []string{"queue:stuck-accounts"}
_ = statsd.Histogram("apollo.queue.enqueued", float64(len(ids)), tags, 1)
_ = statsd.Histogram("apollo.queue.runtime", float64(time.Since(now).Milliseconds()), tags, 1)
}()
2022-07-13 22:43:27 +00:00
stmt := `
2022-05-21 21:15:24 +00:00
UPDATE accounts
SET next_stuck_notification_check_at = $2
WHERE accounts.id IN(
SELECT id
2021-10-17 14:17:41 +00:00
FROM accounts
2022-05-21 21:15:24 +00:00
WHERE next_stuck_notification_check_at < $1
2022-03-28 21:05:01 +00:00
ORDER BY next_stuck_notification_check_at
2022-05-21 21:15:24 +00:00
FOR UPDATE SKIP LOCKED
2021-10-17 14:17:41 +00:00
LIMIT 500
)
RETURNING accounts.id`
2022-07-13 22:43:27 +00:00
rows, err := pool.Query(ctx, stmt, now, next)
2021-10-17 14:17:41 +00:00
if err != nil {
2022-05-23 18:17:25 +00:00
logger.Error("failed to fetch accounts", zap.Error(err))
2021-10-17 14:17:41 +00:00
return
}
2022-07-13 22:43:27 +00:00
for rows.Next() {
var id int64
_ = rows.Scan(&id)
ids = append(ids, id)
}
rows.Close()
2021-10-17 14:17:41 +00:00
if len(ids) == 0 {
return
}
2022-05-23 18:17:25 +00:00
logger.Debug("enqueueing stuck account batch", zap.Int("count", len(ids)), zap.Time("start", now))
2021-07-09 02:15:28 +00:00
2021-10-17 14:17:41 +00:00
batchIds := make([]string, len(ids))
for i, id := range ids {
batchIds[i] = strconv.FormatInt(id, 10)
}
if err = queue.Publish(batchIds...); err != nil {
2022-05-23 18:17:25 +00:00
logger.Error("failed to enqueue stuck account batch", zap.Error(err))
2021-10-17 14:17:41 +00:00
}
}
2022-05-23 18:17:25 +00:00
func enqueueAccounts(ctx context.Context, logger *zap.Logger, statsd *statsd.Client, pool *pgxpool.Pool, redisConn *redis.Client, luaSha string, queue rmq.Queue) {
2022-10-27 00:48:22 +00:00
ctx, cancel := context.WithCancel(ctx)
defer cancel()
2021-10-17 14:17:41 +00:00
now := time.Now()
2022-03-28 21:05:01 +00:00
2022-11-01 01:30:04 +00:00
query := `
SELECT DISTINCT reddit_account_id FROM accounts
INNER JOIN devices_accounts ON devices_accounts.account_id = accounts.id
INNER JOIN devices ON devices.id = devices_accounts.device_id
2022-11-01 01:32:43 +00:00
WHERE grace_period_expires_at >= NOW()
2022-11-01 16:45:11 +00:00
AND accounts.is_deleted IS FALSE
ORDER BY reddit_account_id
2022-11-01 01:30:04 +00:00
`
2022-07-31 19:07:14 +00:00
rows, err := pool.Query(ctx, query)
if err != nil {
logger.Error("failed to fetch accounts", zap.Error(err))
return
}
defer rows.Close()
2022-07-31 19:12:31 +00:00
var ids []string
2022-07-31 19:07:14 +00:00
for rows.Next() {
2022-07-31 19:23:29 +00:00
var id string
2022-07-31 19:07:14 +00:00
_ = rows.Scan(&id)
2022-07-31 19:23:29 +00:00
ids = append(ids, id)
2022-07-31 19:07:14 +00:00
}
2023-03-16 14:01:58 +00:00
chunks := [][]string{}
2023-03-16 16:01:04 +00:00
chunkSize := int(math.Ceil(float64(len(ids)) / float64(accountEnqueueSeconds)))
2023-03-16 14:01:58 +00:00
for i := 0; i < accountEnqueueSeconds; i++ {
2023-03-16 16:01:04 +00:00
left := i * chunkSize
right := (i + 1) * chunkSize
if right > len(ids) {
right = len(ids)
}
chunks = append(chunks, ids[left:right])
2023-03-16 14:01:58 +00:00
}
2021-07-09 01:07:01 +00:00
2023-03-16 14:01:58 +00:00
_ = statsd.Histogram("apollo.queue.runtime", float64(time.Since(now).Milliseconds()), []string{"queue:notifications"}, 1)
2022-05-23 18:17:25 +00:00
2022-06-04 14:24:22 +00:00
wg := sync.WaitGroup{}
2023-03-16 14:01:58 +00:00
for i := 0; i < accountEnqueueSeconds; i++ {
2022-06-04 14:24:22 +00:00
wg.Add(1)
2023-03-16 14:01:58 +00:00
go func(ctx context.Context, offset int) {
2023-03-16 14:13:26 +00:00
defer wg.Done()
2023-03-16 16:10:04 +00:00
candidates := chunks[offset]
select {
case <-ctx.Done(): //context cancelled
case <-time.After(time.Duration(offset) * time.Second): //timeout
}
2021-07-09 03:12:50 +00:00
2023-03-16 16:10:04 +00:00
enqueued, err := redisConn.EvalSha(ctx, luaSha, []string{"locks:accounts"}, candidates).StringSlice()
2022-06-04 14:21:29 +00:00
if err != nil {
logger.Error("failed to check for locked accounts", zap.Error(err))
}
2021-07-09 03:12:50 +00:00
if len(enqueued) == 0 {
logger.Info("no viable candidates to enqueue",
zap.Int("offset", offset),
2023-03-16 16:10:04 +00:00
zap.Int("candidates", len(candidates)),
zap.Int("enqueued", len(enqueued)),
)
return
}
2023-03-16 14:01:58 +00:00
if err = queue.Publish(enqueued...); err != nil {
logger.Error("failed to enqueue account batch",
zap.Error(err),
zap.Int("offset", offset),
zap.Int("candidates", len(candidates)),
2023-03-16 14:01:58 +00:00
zap.Int("enqueued", len(enqueued)),
)
2022-06-04 14:21:29 +00:00
return
}
2023-03-16 14:01:58 +00:00
logger.Info("enqueued account batch",
zap.Int("offset", offset),
zap.Int("candidates", len(candidates)),
2023-03-16 14:01:58 +00:00
zap.Int("enqueued", len(enqueued)),
)
}(ctx, i)
2021-07-08 23:03:46 +00:00
}
2022-06-04 14:21:29 +00:00
wg.Wait()
2021-07-08 23:03:46 +00:00
}