apollo-backend/internal/cmd/scheduler.go

461 lines
11 KiB
Go
Raw Permalink Normal View History

package cmd
2021-07-08 23:03:46 +00:00
import (
"context"
2021-07-09 03:12:50 +00:00
"encoding/json"
2021-07-08 23:03:46 +00:00
"fmt"
"strconv"
2021-07-08 23:03:46 +00:00
"time"
2021-07-09 02:09:14 +00:00
"github.com/DataDog/datadog-go/statsd"
2021-07-08 23:03:46 +00:00
"github.com/adjust/rmq/v4"
"github.com/christianselig/apollo-backend/internal/cmdutil"
2021-08-14 15:54:48 +00:00
"github.com/christianselig/apollo-backend/internal/repository"
2021-07-08 23:03:46 +00:00
"github.com/go-co-op/gocron"
"github.com/go-redis/redis/v8"
"github.com/jackc/pgx/v4"
"github.com/jackc/pgx/v4/pgxpool"
"github.com/sirupsen/logrus"
"github.com/spf13/cobra"
2021-07-08 23:03:46 +00:00
)
2021-07-09 03:12:50 +00:00
const (
2021-09-25 16:56:01 +00:00
batchSize = 250
checkTimeout = 60 // how long until we force a check
accountEnqueueTimeout = 5 // how frequently we want to check (seconds)
2021-10-09 14:59:20 +00:00
subredditEnqueueTimeout = 2 * 60 // how frequently we want to check (seconds)
userEnqueueTimeout = 2 * 60 // how frequently we want to check (seconds)
2021-08-14 16:08:17 +00:00
2021-09-27 19:54:29 +00:00
staleAccountThreshold = 7200 // 2 hours
2021-07-09 03:12:50 +00:00
)
func SchedulerCmd(ctx context.Context) *cobra.Command {
cmd := &cobra.Command{
Use: "scheduler",
Args: cobra.ExactArgs(0),
Short: "Schedules jobs and runs several maintenance tasks periodically.",
RunE: func(cmd *cobra.Command, args []string) error {
logger := cmdutil.NewLogrusLogger(false)
statsd, err := cmdutil.NewStatsdClient()
if err != nil {
return err
}
defer statsd.Close()
2021-07-20 17:00:53 +00:00
db, err := cmdutil.NewDatabasePool(ctx, 1)
if err != nil {
return err
}
defer db.Close()
redis, err := cmdutil.NewRedisClient(ctx)
if err != nil {
return err
}
defer redis.Close()
queue, err := cmdutil.NewQueueClient(logger, redis, "worker")
if err != nil {
return err
}
// Eval lua so that we don't keep parsing it
luaSha, err := evalScript(ctx, redis)
if err != nil {
return err
}
notifQueue, err := queue.OpenQueue("notifications")
if err != nil {
return err
}
2021-09-25 16:56:01 +00:00
subredditQueue, err := queue.OpenQueue("subreddits")
if err != nil {
return err
}
2021-10-10 15:51:42 +00:00
trendingQueue, err := queue.OpenQueue("trending")
if err != nil {
return err
}
2021-10-09 14:59:20 +00:00
userQueue, err := queue.OpenQueue("users")
if err != nil {
return err
}
s := gocron.NewScheduler(time.UTC)
2021-09-25 13:19:42 +00:00
_, _ = s.Every(200).Milliseconds().SingletonMode().Do(func() { enqueueAccounts(ctx, logger, statsd, db, redis, luaSha, notifQueue) })
2021-10-10 15:51:42 +00:00
_, _ = s.Every(200).Milliseconds().SingletonMode().Do(func() { enqueueSubreddits(ctx, logger, statsd, db, []rmq.Queue{subredditQueue, trendingQueue}) })
2021-10-09 14:59:20 +00:00
_, _ = s.Every(200).Milliseconds().SingletonMode().Do(func() { enqueueUsers(ctx, logger, statsd, db, userQueue) })
2021-09-25 13:19:42 +00:00
_, _ = s.Every(1).Second().Do(func() { cleanQueues(ctx, logger, queue) })
_, _ = s.Every(1).Minute().Do(func() { reportStats(ctx, logger, statsd, db, redis) })
_, _ = s.Every(1).Minute().Do(func() { pruneAccounts(ctx, logger, db) })
_, _ = s.Every(1).Minute().Do(func() { pruneDevices(ctx, logger, db) })
s.StartAsync()
<-ctx.Done()
s.Stop()
return nil
},
2021-07-08 23:03:46 +00:00
}
return cmd
2021-07-08 23:03:46 +00:00
}
func evalScript(ctx context.Context, redis *redis.Client) (string, error) {
lua := fmt.Sprintf(`
local retv={}
local ids=cjson.decode(ARGV[1])
for i=1, #ids do
local key = KEYS[1] .. ":" .. ids[i]
if redis.call("exists", key) == 0 then
redis.call("setex", key, %d, 1)
retv[#retv + 1] = ids[i]
end
end
return retv
`, checkTimeout)
return redis.ScriptLoad(ctx, lua).Result()
}
2021-08-14 15:59:13 +00:00
func pruneAccounts(ctx context.Context, logger *logrus.Logger, pool *pgxpool.Pool) {
2021-08-14 16:08:17 +00:00
before := time.Now().Unix() - staleAccountThreshold
2021-08-14 15:54:48 +00:00
ar := repository.NewPostgresAccount(pool)
2021-08-14 16:08:17 +00:00
stale, err := ar.PruneStale(ctx, before)
2021-08-14 15:54:48 +00:00
if err != nil {
logger.WithFields(logrus.Fields{
"err": err,
}).Error("failed cleaning stale accounts")
return
}
2021-08-14 15:59:13 +00:00
orphaned, err := ar.PruneOrphaned(ctx)
2021-07-12 19:36:22 +00:00
if err != nil {
logger.WithFields(logrus.Fields{
"err": err,
2021-08-14 15:59:13 +00:00
}).Error("failed cleaning orphaned accounts")
2021-07-12 19:36:22 +00:00
return
}
2021-08-14 15:59:13 +00:00
count := stale + orphaned
2021-07-23 00:22:46 +00:00
if count > 0 {
logger.WithFields(logrus.Fields{
"count": count,
2021-08-14 15:59:13 +00:00
}).Info("pruned accounts")
2021-07-23 00:22:46 +00:00
}
2021-07-12 19:36:22 +00:00
}
2021-08-14 16:08:17 +00:00
func pruneDevices(ctx context.Context, logger *logrus.Logger, pool *pgxpool.Pool) {
2021-09-25 18:56:14 +00:00
threshold := time.Now().Unix()
2021-08-14 16:08:17 +00:00
dr := repository.NewPostgresDevice(pool)
2021-09-25 18:56:14 +00:00
count, err := dr.PruneStale(ctx, threshold)
2021-08-14 16:08:17 +00:00
if err != nil {
logger.WithFields(logrus.Fields{
"err": err,
}).Error("failed cleaning stale devices")
return
}
if count > 0 {
logger.WithFields(logrus.Fields{
"count": count,
}).Info("pruned devices")
}
}
2021-07-12 19:36:22 +00:00
func cleanQueues(ctx context.Context, logger *logrus.Logger, jobsConn rmq.Connection) {
cleaner := rmq.NewCleaner(jobsConn)
count, err := cleaner.Clean()
if err != nil {
logger.WithFields(logrus.Fields{
"err": err,
}).Error("failed cleaning jobs from queues")
return
}
logger.WithFields(logrus.Fields{
"count": count,
}).Debug("returned jobs to queues")
}
func reportStats(ctx context.Context, logger *logrus.Logger, statsd *statsd.Client, pool *pgxpool.Pool, redisConn *redis.Client) {
var (
count int64
metrics = []struct {
query string
name string
}{
{"SELECT COUNT(*) FROM accounts", "apollo.registrations.accounts"},
{"SELECT COUNT(*) FROM devices", "apollo.registrations.devices"},
}
)
for _, metric := range metrics {
2021-09-25 13:19:42 +00:00
_ = pool.QueryRow(ctx, metric.query).Scan(&count)
_ = statsd.Gauge(metric.name, float64(count), []string{}, 1)
logger.WithFields(logrus.Fields{
"count": count,
"metric": metric.name,
}).Debug("fetched metrics")
}
}
2021-10-09 14:59:20 +00:00
func enqueueUsers(ctx context.Context, logger *logrus.Logger, statsd *statsd.Client, pool *pgxpool.Pool, queue rmq.Queue) {
now := time.Now()
ready := now.Unix() - userEnqueueTimeout
ids := []int64{}
err := pool.BeginFunc(ctx, func(tx pgx.Tx) error {
stmt := `
WITH userb AS (
SELECT id
FROM users
WHERE last_checked_at < $1
ORDER BY last_checked_at
LIMIT 100
)
UPDATE users
SET last_checked_at = $2
WHERE users.id IN(SELECT id FROM userb)
RETURNING users.id`
rows, err := tx.Query(ctx, stmt, ready, now.Unix())
if err != nil {
return err
}
defer rows.Close()
for rows.Next() {
var id int64
_ = rows.Scan(&id)
ids = append(ids, id)
}
return nil
})
if err != nil {
logger.WithFields(logrus.Fields{
"err": err,
}).Error("failed to fetch batch of users")
return
}
if len(ids) == 0 {
return
}
logger.WithFields(logrus.Fields{
"count": len(ids),
"start": ready,
}).Debug("enqueueing user batch")
batchIds := make([]string, len(ids))
for i, id := range ids {
batchIds[i] = strconv.FormatInt(id, 10)
}
if err = queue.Publish(batchIds...); err != nil {
logger.WithFields(logrus.Fields{
"err": err,
}).Error("failed to enqueue user")
}
_ = statsd.Histogram("apollo.queue.users.enqueued", float64(len(ids)), []string{}, 1)
_ = statsd.Histogram("apollo.queue.users.runtime", float64(time.Since(now).Milliseconds()), []string{}, 1)
}
2021-10-10 15:51:42 +00:00
func enqueueSubreddits(ctx context.Context, logger *logrus.Logger, statsd *statsd.Client, pool *pgxpool.Pool, queues []rmq.Queue) {
2021-09-25 16:56:01 +00:00
now := time.Now()
ready := now.Unix() - subredditEnqueueTimeout
ids := []int64{}
err := pool.BeginFunc(ctx, func(tx pgx.Tx) error {
stmt := `
WITH subreddit AS (
SELECT id
FROM subreddits
WHERE last_checked_at < $1
ORDER BY last_checked_at
LIMIT 100
)
UPDATE subreddits
SET last_checked_at = $2
WHERE subreddits.id IN(SELECT id FROM subreddit)
RETURNING subreddits.id`
rows, err := tx.Query(ctx, stmt, ready, now.Unix())
if err != nil {
return err
}
defer rows.Close()
for rows.Next() {
var id int64
_ = rows.Scan(&id)
ids = append(ids, id)
}
return nil
})
if err != nil {
logger.WithFields(logrus.Fields{
"err": err,
}).Error("failed to fetch batch of subreddits")
return
}
if len(ids) == 0 {
return
}
logger.WithFields(logrus.Fields{
"count": len(ids),
"start": ready,
}).Debug("enqueueing subreddit batch")
batchIds := make([]string, len(ids))
for i, id := range ids {
batchIds[i] = strconv.FormatInt(id, 10)
}
2021-10-10 15:51:42 +00:00
for _, queue := range queues {
if err = queue.Publish(batchIds...); err != nil {
logger.WithFields(logrus.Fields{
"queue": queue,
"err": err,
}).Error("failed to enqueue subreddit")
}
2021-09-25 16:56:01 +00:00
}
_ = statsd.Histogram("apollo.queue.subreddits.enqueued", float64(len(ids)), []string{}, 1)
_ = statsd.Histogram("apollo.queue.subreddits.runtime", float64(time.Since(now).Milliseconds()), []string{}, 1)
}
func enqueueAccounts(ctx context.Context, logger *logrus.Logger, statsd *statsd.Client, pool *pgxpool.Pool, redisConn *redis.Client, luaSha string, queue rmq.Queue) {
2021-07-09 02:15:28 +00:00
start := time.Now()
2021-07-09 01:07:01 +00:00
now := float64(time.Now().UnixNano()/int64(time.Millisecond)) / 1000
2021-07-09 02:09:14 +00:00
// Start looking for accounts that were last checked at least 5 seconds ago
// and at most 6 seconds ago. Also look for accounts that haven't been checked
// in over a minute.
2021-07-09 02:15:28 +00:00
ts := start.Unix()
2021-09-25 16:56:01 +00:00
ready := ts - accountEnqueueTimeout
2021-07-09 04:27:50 +00:00
expired := ts - checkTimeout
2021-07-09 00:53:09 +00:00
2021-07-08 23:03:46 +00:00
ids := []int64{}
err := pool.BeginFunc(ctx, func(tx pgx.Tx) error {
stmt := `
WITH account AS (
SELECT id
FROM accounts
2021-07-09 01:01:45 +00:00
WHERE
2021-07-09 04:27:50 +00:00
last_enqueued_at < $1
OR last_checked_at < $2
ORDER BY last_checked_at
2021-07-23 01:52:29 +00:00
LIMIT 2500
2021-07-08 23:03:46 +00:00
)
UPDATE accounts
2021-07-09 04:27:50 +00:00
SET last_enqueued_at = $3
2021-07-08 23:03:46 +00:00
WHERE accounts.id IN(SELECT id FROM account)
RETURNING accounts.id`
2021-07-09 04:27:50 +00:00
rows, err := tx.Query(ctx, stmt, ready, expired, now)
2021-07-08 23:03:46 +00:00
if err != nil {
return err
}
defer rows.Close()
for rows.Next() {
var id int64
2021-09-25 13:19:42 +00:00
_ = rows.Scan(&id)
2021-07-08 23:03:46 +00:00
ids = append(ids, id)
}
return nil
})
if err != nil {
logger.WithFields(logrus.Fields{
"err": err,
}).Error("failed to fetch batch of accounts")
return
}
logger.WithFields(logrus.Fields{
"count": len(ids),
2021-07-09 04:27:50 +00:00
"start": ready,
2021-07-08 23:03:46 +00:00
}).Debug("enqueueing account batch")
2021-07-09 00:17:20 +00:00
enqueued := 0
2021-07-09 00:26:01 +00:00
skipped := 0
2021-07-09 03:12:50 +00:00
// Split ids in batches
for i := 0; i < len(ids); i += batchSize {
j := i + batchSize
if j > len(ids) {
j = len(ids)
2021-07-08 23:03:46 +00:00
}
2021-07-09 03:12:50 +00:00
batch := Int64Slice(ids[i:j])
logger.WithFields(logrus.Fields{
"len": len(batch),
}).Debug("enqueueing batch")
res, err := redisConn.EvalSha(ctx, luaSha, []string{"locks:accounts"}, batch).Result()
2021-07-09 03:12:50 +00:00
if err != nil {
2021-07-09 00:26:01 +00:00
logger.WithFields(logrus.Fields{
2021-07-09 03:12:50 +00:00
"err": err,
}).Error("failed to check for locked accounts")
2021-07-09 00:26:01 +00:00
}
2021-07-09 03:12:50 +00:00
vals := res.([]interface{})
skipped += len(batch) - len(vals)
enqueued += len(vals)
2021-07-09 03:12:50 +00:00
if len(vals) == 0 {
continue
}
batchIds := make([]string, len(vals))
for k, v := range vals {
batchIds[k] = strconv.FormatInt(v.(int64), 10)
}
if err = queue.Publish(batchIds...); err != nil {
logger.WithFields(logrus.Fields{
"err": err,
}).Error("failed to enqueue account")
}
2021-07-08 23:03:46 +00:00
}
2021-09-25 16:56:01 +00:00
_ = statsd.Histogram("apollo.queue.notifications.enqueued", float64(enqueued), []string{}, 1)
_ = statsd.Histogram("apollo.queue.notifications.skipped", float64(skipped), []string{}, 1)
_ = statsd.Histogram("apollo.queue.notifications.runtime", float64(time.Since(start).Milliseconds()), []string{}, 1)
2021-07-09 02:09:14 +00:00
2021-07-08 23:03:46 +00:00
logger.WithFields(logrus.Fields{
2021-07-09 00:26:01 +00:00
"count": enqueued,
"skipped": skipped,
2021-07-09 04:27:50 +00:00
"start": ready,
2021-07-09 06:00:57 +00:00
}).Debug("done enqueueing account batch")
2021-07-08 23:03:46 +00:00
}
2021-07-09 03:12:50 +00:00
type Int64Slice []int64
func (ii Int64Slice) MarshalBinary() (data []byte, err error) {
bytes, err := json.Marshal(ii)
return bytes, err
}