this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

better crawl stats endpoint

authored by Brian Olson and committed by Brian Olson
e2fb048a b8d53b7f

+45 -17
+9
cmd/collectiondir/crawl.go
··· 10 10 "net/http" 11 11 "net/url" 12 12 "os" 13 + "sync/atomic" 13 14 14 15 "github.com/urfave/cli/v2" 15 16 ··· 114 115 QPS float64 115 116 Results chan<- DidCollection 116 117 Log *slog.Logger 118 + Stats *CrawlStats 119 + } 120 + 121 + type CrawlStats struct { 122 + ReposDescribed atomic.Uint32 117 123 } 118 124 119 125 // CrawlPDSRepoCollections ··· 140 146 } 141 147 for _, collection := range desc.Collections { 142 148 cr.Results <- DidCollection{Did: xr.Did, Collection: collection} 149 + } 150 + if cr.Stats != nil { 151 + cr.Stats.ReposDescribed.Add(1) 143 152 } 144 153 } 145 154 if repos.Cursor != nil {
+36 -17
cmd/collectiondir/serve.go
··· 136 136 ExepctedAuthHeader string 137 137 PerPDSCrawlQPS float64 138 138 139 - activeCrawlHosts map[string]time.Time 139 + activeCrawls map[string]activeCrawl 140 140 activeCrawlsLock sync.Mutex 141 141 142 142 shutdown chan struct{} ··· 155 155 didCollectionCounts *lru.Cache[string, int] 156 156 157 157 badwords BadwordChecker 158 + } 159 + 160 + type activeCrawl struct { 161 + start time.Time 162 + stats *CrawlStats 158 163 } 159 164 160 165 func (cs *collectionServer) run(cctx *cli.Context) error { ··· 888 893 Log: cs.log, 889 894 } 890 895 start := time.Now() 891 - ok := cs.recordCrawlStart(host, start) 896 + ok, crawlStats := cs.recordCrawlStart(host, start) 892 897 if !ok { 893 898 cs.log.Info("not crawling dup", "host", host) 899 + return 894 900 } 901 + crawler.Stats = crawlStats 895 902 cs.log.Info("crawling", "host", host) 896 903 err := crawler.CrawlPDSRepoCollections() 897 904 cs.clearActiveCrawl(host) ··· 905 912 } 906 913 907 914 // recordCrawlStart returns true if ok, false if duplicate 908 - func (cs *collectionServer) recordCrawlStart(host string, start time.Time) (ok bool) { 915 + func (cs *collectionServer) recordCrawlStart(host string, start time.Time) (ok bool, stats *CrawlStats) { 909 916 cs.activeCrawlsLock.Lock() 910 917 defer cs.activeCrawlsLock.Unlock() 911 - if cs.activeCrawlHosts == nil { 912 - cs.activeCrawlHosts = make(map[string]time.Time) 913 - cs.activeCrawlHosts[host] = start 914 - return true 918 + if cs.activeCrawls == nil { 919 + cs.activeCrawls = make(map[string]activeCrawl) 915 920 } else { 916 - _, dup := cs.activeCrawlHosts[host] 921 + _, dup := cs.activeCrawls[host] 917 922 if dup { 918 - return false 923 + return false, nil 919 924 } 920 - cs.activeCrawlHosts[host] = start 921 - return true 925 + } 926 + stats = new(CrawlStats) 927 + cs.activeCrawls[host] = activeCrawl{ 928 + start: start, 929 + stats: stats, 922 930 } 931 + return true, stats 923 932 } 924 933 925 934 func (cs *collectionServer) clearActiveCrawl(host 
string) { 926 935 cs.activeCrawlsLock.Lock() 927 936 defer cs.activeCrawlsLock.Unlock() 928 - if cs.activeCrawlHosts == nil { 937 + if cs.activeCrawls == nil { 929 938 return 930 939 } 931 - delete(cs.activeCrawlHosts, host) 940 + delete(cs.activeCrawls, host) 932 941 } 933 942 934 943 type CrawlStatusResponse struct { 935 - HostStarts map[string]string `json:"host_starts"` 944 + HostCrawls map[string]HostCrawl `json:"host_starts"` 945 + ServerTime string `json:"server_time"` 946 + } 947 + type HostCrawl struct { 948 + Start string `json:"start"` 949 + ReposDescribed uint32 `json:"seen"` 936 950 } 937 951 938 952 // GET /v1/crawlStatus ··· 942 956 return c.JSON(http.StatusForbidden, CrawlRequestResponse{Error: "nope"}) 943 957 } 944 958 var out CrawlStatusResponse 945 - out.HostStarts = make(map[string]string) 959 + out.HostCrawls = make(map[string]HostCrawl) 946 960 cs.activeCrawlsLock.Lock() 947 961 defer cs.activeCrawlsLock.Unlock() 948 - for host, start := range cs.activeCrawlHosts { 949 - out.HostStarts[host] = start.UTC().Format(time.RFC3339) 962 + for host, rec := range cs.activeCrawls { 963 + start := rec.start 964 + out.HostCrawls[host] = HostCrawl{ 965 + Start: start.UTC().Format(time.RFC3339Nano), 966 + ReposDescribed: rec.stats.ReposDescribed.Load(), 967 + } 950 968 } 969 + out.ServerTime = time.Now().UTC().Format(time.RFC3339Nano) 951 970 return c.JSON(http.StatusOK, out) 952 971 } 953 972