A social RSS reader built on the AT Protocol. glean.at
glean atproto atmosphere rss feed social app
14
fork

Configure Feed

Select the types of activity you want to include in your feed.

Add article content fetching and UX improvements

+580 -82
+4 -3
go.mod
··· 8 8 github.com/go-chi/cors v1.2.2 9 9 github.com/gorilla/websocket v1.5.3 10 10 github.com/mattn/go-sqlite3 v1.14.22 11 + github.com/prometheus/client_golang v1.17.0 12 + golang.org/x/net v0.53.0 11 13 gotest.tools/v3 v3.5.2 12 14 ) 13 15 ··· 21 23 github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect 22 24 github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect 23 25 github.com/mr-tron/base58 v1.2.0 // indirect 24 - github.com/prometheus/client_golang v1.17.0 // indirect 25 26 github.com/prometheus/client_model v0.5.0 // indirect 26 27 github.com/prometheus/common v0.45.0 // indirect 27 28 github.com/prometheus/procfs v0.12.0 // indirect 28 29 gitlab.com/yawning/secp256k1-voi v0.0.0-20230925100816-f2616030848b // indirect 29 30 gitlab.com/yawning/tuplehash v0.0.0-20230713102510-df83abbf9a02 // indirect 30 - golang.org/x/crypto v0.21.0 // indirect 31 - golang.org/x/sys v0.22.0 // indirect 31 + golang.org/x/crypto v0.50.0 // indirect 32 + golang.org/x/sys v0.43.0 // indirect 32 33 golang.org/x/time v0.3.0 // indirect 33 34 google.golang.org/protobuf v1.33.0 // indirect 34 35 )
+6 -4
go.sum
··· 65 65 gitlab.com/yawning/secp256k1-voi v0.0.0-20230925100816-f2616030848b/go.mod h1:/y/V339mxv2sZmYYR64O07VuCpdNZqCTwO8ZcouTMI8= 66 66 gitlab.com/yawning/tuplehash v0.0.0-20230713102510-df83abbf9a02 h1:qwDnMxjkyLmAFgcfgTnfJrmYKWhHnci3GjDqcZp1M3Q= 67 67 gitlab.com/yawning/tuplehash v0.0.0-20230713102510-df83abbf9a02/go.mod h1:JTnUj0mpYiAsuZLmKjTx/ex3AtMowcCgnE7YNyCEP0I= 68 - golang.org/x/crypto v0.21.0 h1:X31++rzVUdKhX5sWmSOFZxx8UW/ldWx55cbf08iNAMA= 69 - golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOMs= 70 - golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= 71 - golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 68 + golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI= 69 + golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q= 70 + golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA= 71 + golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= 72 + golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI= 73 + golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= 72 74 golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= 73 75 golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= 74 76 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+31 -72
internal/db/article.go
··· 7 7 ) 8 8 9 9 type Article struct { 10 - ID int64 11 - FeedURL string 12 - FeedTitle string 13 - GUID string 14 - Title string 15 - URL sql.NullString 16 - Author sql.NullString 17 - Summary sql.NullString 18 - Content sql.NullString 19 - Published sql.NullTime 20 - Updated sql.NullTime 21 - FetchedAt sql.NullTime 22 - IsRead sql.NullBool 23 - IsStarred sql.NullBool 10 + ID int64 11 + FeedURL string 12 + FeedTitle string 13 + GUID string 14 + Title string 15 + URL sql.NullString 16 + Author sql.NullString 17 + Summary sql.NullString 18 + Content sql.NullString 19 + FullContent sql.NullString 20 + Published sql.NullTime 21 + Updated sql.NullTime 22 + FetchedAt sql.NullTime 23 + IsRead sql.NullBool 24 24 } 25 25 26 26 type ReadState struct { ··· 28 28 ArticleID int64 29 29 IsRead bool 30 30 ReadAt sql.NullTime 31 - IsStarred bool 32 - StarredAt sql.NullTime 33 31 } 34 32 35 33 func (db *DB) UpsertArticle(ctx context.Context, article *Article) (int64, error) { ··· 52 50 func (db *DB) GetArticle(ctx context.Context, id int64) (*Article, error) { 53 51 a := &Article{} 54 52 err := db.QueryRowContext(ctx, ` 55 - SELECT id, feed_url, guid, title, url, author, summary, content, published, updated, fetched_at 53 + SELECT id, feed_url, guid, title, url, author, summary, content, full_content, published, updated, fetched_at 56 54 FROM articles WHERE id = ? 
57 55 `, id).Scan(&a.ID, &a.FeedURL, &a.GUID, &a.Title, &a.URL, &a.Author, 58 - &a.Summary, &a.Content, &a.Published, &a.Updated, &a.FetchedAt) 56 + &a.Summary, &a.Content, &a.FullContent, &a.Published, &a.Updated, &a.FetchedAt) 59 57 if err != nil { 60 58 return nil, err 61 59 } ··· 66 64 query := ` 67 65 SELECT a.id, a.feed_url, COALESCE(f.title, ''), a.guid, a.title, a.url, a.author, a.summary, a.content, 68 66 a.published, a.updated, a.fetched_at, 69 - COALESCE(r.is_read, 0), COALESCE(r.is_starred, 0) 67 + COALESCE(r.is_read, 0) 70 68 FROM articles a 71 69 JOIN subscriptions s ON a.feed_url = s.feed_url AND s.user_did = ? 72 70 LEFT JOIN feeds f ON a.feed_url = f.feed_url ··· 92 90 a := &Article{} 93 91 if err := rows.Scan(&a.ID, &a.FeedURL, &a.FeedTitle, &a.GUID, &a.Title, &a.URL, &a.Author, 94 92 &a.Summary, &a.Content, &a.Published, &a.Updated, &a.FetchedAt, 95 - &a.IsRead, &a.IsStarred); err != nil { 93 + &a.IsRead); err != nil { 96 94 return nil, err 97 95 } 98 96 articles = append(articles, a) ··· 104 102 query := ` 105 103 SELECT a.id, a.feed_url, COALESCE(f.title, ''), a.guid, a.title, a.url, a.author, a.summary, a.content, 106 104 a.published, a.updated, a.fetched_at, 107 - COALESCE(r.is_read, 0), COALESCE(r.is_starred, 0) 105 + COALESCE(r.is_read, 0) 108 106 FROM articles a 109 107 JOIN subscriptions s ON a.feed_url = s.feed_url AND s.user_did = ? 
110 108 LEFT JOIN feeds f ON a.feed_url = f.feed_url ··· 131 129 a := &Article{} 132 130 if err := rows.Scan(&a.ID, &a.FeedURL, &a.FeedTitle, &a.GUID, &a.Title, &a.URL, &a.Author, 133 131 &a.Summary, &a.Content, &a.Published, &a.Updated, &a.FetchedAt, 134 - &a.IsRead, &a.IsStarred); err != nil { 132 + &a.IsRead); err != nil { 135 133 return nil, err 136 134 } 137 135 articles = append(articles, a) ··· 143 141 query := ` 144 142 SELECT a.id, a.feed_url, COALESCE(f.title, ''), a.guid, a.title, a.url, a.author, a.summary, a.content, 145 143 a.published, a.updated, a.fetched_at, 146 - COALESCE(r.is_read, 0), COALESCE(r.is_starred, 0) 144 + COALESCE(r.is_read, 0) 147 145 FROM articles a 148 146 JOIN subscriptions s ON a.feed_url = s.feed_url AND s.user_did = ? 149 147 LEFT JOIN feeds f ON a.feed_url = f.feed_url ··· 170 168 a := &Article{} 171 169 if err := rows.Scan(&a.ID, &a.FeedURL, &a.FeedTitle, &a.GUID, &a.Title, &a.URL, &a.Author, 172 170 &a.Summary, &a.Content, &a.Published, &a.Updated, &a.FetchedAt, 173 - &a.IsRead, &a.IsStarred); err != nil { 174 - return nil, err 175 - } 176 - articles = append(articles, a) 177 - } 178 - return articles, rows.Err() 179 - } 180 - 181 - func (db *DB) ListStarredArticles(ctx context.Context, userDID string, limit, offset int) ([]*Article, error) { 182 - rows, err := db.QueryContext(ctx, fmt.Sprintf(` 183 - SELECT a.id, a.feed_url, COALESCE(f.title, ''), a.guid, a.title, a.url, a.author, a.summary, a.content, 184 - a.published, a.updated, a.fetched_at 185 - FROM articles a 186 - JOIN read_state r ON r.article_id = a.id AND r.user_did = ? 
187 - LEFT JOIN feeds f ON a.feed_url = f.feed_url 188 - WHERE r.is_starred = 1 189 - ORDER BY r.starred_at DESC 190 - LIMIT %d OFFSET %d 191 - `, limit, offset), userDID) 192 - if err != nil { 193 - return nil, err 194 - } 195 - defer rows.Close() 196 - 197 - var articles []*Article 198 - for rows.Next() { 199 - a := &Article{} 200 - if err := rows.Scan(&a.ID, &a.FeedURL, &a.FeedTitle, &a.GUID, &a.Title, &a.URL, &a.Author, 201 - &a.Summary, &a.Content, &a.Published, &a.Updated, &a.FetchedAt); err != nil { 171 + &a.IsRead); err != nil { 202 172 return nil, err 203 173 } 204 174 articles = append(articles, a) ··· 251 221 return err 252 222 } 253 223 254 - func (db *DB) StarArticle(ctx context.Context, userDID string, articleID int64) error { 255 - _, err := db.ExecContext(ctx, ` 256 - INSERT INTO read_state (user_did, article_id, is_starred, starred_at) 257 - VALUES (?, ?, 1, CURRENT_TIMESTAMP) 258 - ON CONFLICT(user_did, article_id) DO UPDATE SET 259 - is_starred = 1, starred_at = CURRENT_TIMESTAMP 260 - `, userDID, articleID) 261 - return err 262 - } 263 - 264 - func (db *DB) UnstarArticle(ctx context.Context, userDID string, articleID int64) error { 265 - _, err := db.ExecContext(ctx, ` 266 - UPDATE read_state SET is_starred = 0, starred_at = NULL 267 - WHERE user_did = ? AND article_id = ? 268 - `, userDID, articleID) 269 - return err 270 - } 271 - 272 224 func (db *DB) GetReadState(ctx context.Context, userDID string, articleID int64) (*ReadState, error) { 273 225 rs := &ReadState{} 274 226 err := db.QueryRowContext(ctx, ` 275 - SELECT user_did, article_id, is_read, read_at, is_starred, starred_at 227 + SELECT user_did, article_id, is_read, read_at 276 228 FROM read_state WHERE user_did = ? AND article_id = ? 
277 - `, userDID, articleID).Scan(&rs.UserDID, &rs.ArticleID, &rs.IsRead, &rs.ReadAt, &rs.IsStarred, &rs.StarredAt) 229 + `, userDID, articleID).Scan(&rs.UserDID, &rs.ArticleID, &rs.IsRead, &rs.ReadAt) 278 230 if err == sql.ErrNoRows { 279 231 return &ReadState{UserDID: userDID, ArticleID: articleID}, nil 280 232 } ··· 305 257 `, userDID, userDID).Scan(&count) 306 258 return count, err 307 259 } 260 + 261 + func (db *DB) UpdateArticleFullContent(ctx context.Context, id int64, fullContent string) error { 262 + _, err := db.ExecContext(ctx, ` 263 + UPDATE articles SET full_content = ? WHERE id = ? 264 + `, fullContent, id) 265 + return err 266 + }
+24
internal/db/article_test.go
··· 161 161 assert.NilError(t, err) 162 162 assert.Equal(t, count, 1) 163 163 } 164 + 165 + func TestUpdateArticleFullContent(t *testing.T) { 166 + ctx := context.Background() 167 + db := setupTestDB(t) 168 + _, _, _, articleID := seedArticleReadState(t, ctx, db) 169 + 170 + err := db.UpdateArticleFullContent(ctx, articleID, "<p>Scraped content</p>") 171 + assert.NilError(t, err) 172 + 173 + article, err := db.GetArticle(ctx, articleID) 174 + assert.NilError(t, err) 175 + assert.Equal(t, article.FullContent.String, "<p>Scraped content</p>") 176 + assert.Assert(t, article.FullContent.Valid) 177 + } 178 + 179 + func TestGetArticle_IncludesFullContent(t *testing.T) { 180 + ctx := context.Background() 181 + db := setupTestDB(t) 182 + _, _, _, articleID := seedArticleReadState(t, ctx, db) 183 + 184 + article, err := db.GetArticle(ctx, articleID) 185 + assert.NilError(t, err) 186 + assert.Assert(t, !article.FullContent.Valid) 187 + }
+1 -3
internal/db/db.go
··· 101 101 author TEXT, 102 102 summary TEXT, 103 103 content TEXT, 104 + full_content TEXT, 104 105 published DATETIME, 105 106 updated DATETIME, 106 107 fetched_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, ··· 111 112 article_id INTEGER NOT NULL REFERENCES articles(id), 112 113 is_read BOOLEAN NOT NULL DEFAULT 0, 113 114 read_at DATETIME, 114 - is_starred BOOLEAN NOT NULL DEFAULT 0, 115 - starred_at DATETIME, 116 115 PRIMARY KEY (user_did, article_id) 117 116 )`, 118 117 `CREATE TABLE IF NOT EXISTS annotations ( ··· 185 184 `CREATE INDEX IF NOT EXISTS idx_articles_feed ON articles(feed_url)`, 186 185 `CREATE INDEX IF NOT EXISTS idx_articles_published ON articles(published DESC)`, 187 186 `CREATE INDEX IF NOT EXISTS idx_read_state_unread ON read_state(user_did, is_read) WHERE is_read = 0`, 188 - `CREATE INDEX IF NOT EXISTS idx_read_state_starred ON read_state(user_did, is_starred) WHERE is_starred = 1`, 189 187 `CREATE INDEX IF NOT EXISTS idx_annotations_article ON annotations(article_url)`, 190 188 `CREATE INDEX IF NOT EXISTS idx_likes_article ON likes(feed_url, article_url)`, 191 189 `CREATE INDEX IF NOT EXISTS idx_likes_author ON likes(author_did)`,
+281
internal/scraper/scraper.go
··· 1 + package scraper 2 + 3 + import ( 4 + "bytes" 5 + "context" 6 + "fmt" 7 + "io" 8 + "log/slog" 9 + "net/http" 10 + "strings" 11 + "time" 12 + 13 + "golang.org/x/net/html" 14 + ) 15 + 16 + type Scraper struct { 17 + client *http.Client 18 + logger *slog.Logger 19 + archiveURL string 20 + } 21 + 22 + func New(logger *slog.Logger) *Scraper { 23 + return &Scraper{ 24 + client: &http.Client{ 25 + Timeout: 15 * time.Second, 26 + CheckRedirect: func(req *http.Request, via []*http.Request) error { 27 + if len(via) >= 10 { 28 + return fmt.Errorf("too many redirects") 29 + } 30 + return nil 31 + }, 32 + }, 33 + logger: logger, 34 + archiveURL: "https://archive.is/newest/", 35 + } 36 + } 37 + 38 + func (s *Scraper) Scrape(ctx context.Context, articleURL string) (string, error) { 39 + content, err := s.scrapeDirect(ctx, articleURL) 40 + if err != nil { 41 + s.logger.Warn("direct scrape failed, trying archive.is", "error", err, "url", articleURL) 42 + return s.scrapeArchive(ctx, articleURL) 43 + } 44 + if content == "" { 45 + s.logger.Warn("direct scrape returned empty content, trying archive.is", "url", articleURL) 46 + return s.scrapeArchive(ctx, articleURL) 47 + } 48 + return content, nil 49 + } 50 + 51 + func (s *Scraper) scrapeDirect(ctx context.Context, articleURL string) (string, error) { 52 + body, err := s.fetch(ctx, articleURL) 53 + if err != nil { 54 + return "", fmt.Errorf("fetching article: %w", err) 55 + } 56 + return extractContent(body) 57 + } 58 + 59 + func (s *Scraper) scrapeArchive(ctx context.Context, articleURL string) (string, error) { 60 + archiveURL := s.archiveURL + articleURL 61 + body, err := s.fetch(ctx, archiveURL) 62 + if err != nil { 63 + return "", fmt.Errorf("fetching from archive.is: %w", err) 64 + } 65 + return extractContent(body) 66 + } 67 + 68 + func (s *Scraper) fetch(ctx context.Context, url string) (io.Reader, error) { 69 + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) 70 + if err != nil { 71 + return nil, 
err 72 + } 73 + req.Header.Set("User-Agent", "Glean/1.0 (RSS Reader)") 74 + 75 + resp, err := s.client.Do(req) 76 + if err != nil { 77 + return nil, err 78 + } 79 + defer resp.Body.Close() 80 + 81 + if resp.StatusCode != http.StatusOK { 82 + return nil, fmt.Errorf("unexpected status: %d", resp.StatusCode) 83 + } 84 + 85 + data, err := io.ReadAll(io.LimitReader(resp.Body, 2*1024*1024)) 86 + if err != nil { 87 + return nil, fmt.Errorf("reading body: %w", err) 88 + } 89 + return bytes.NewReader(data), nil 90 + } 91 + 92 + func extractContent(r io.Reader) (string, error) { 93 + doc, err := html.Parse(r) 94 + if err != nil { 95 + return "", fmt.Errorf("parsing HTML: %w", err) 96 + } 97 + 98 + removeUnwanted(doc) 99 + 100 + if content := findElement(doc, "article"); content != nil { 101 + return renderNode(content), nil 102 + } 103 + 104 + if content := findElementByRole(doc, "main"); content != nil { 105 + return renderNode(content), nil 106 + } 107 + 108 + if content := findElement(doc, "main"); content != nil { 109 + return renderNode(content), nil 110 + } 111 + 112 + if content := findLargestTextNode(doc); content != nil { 113 + return renderNode(content), nil 114 + } 115 + 116 + return "", nil 117 + } 118 + 119 + func removeUnwanted(n *html.Node) { 120 + var remove []*html.Node 121 + for c := n.FirstChild; c != nil; c = c.NextSibling { 122 + if isUnwanted(c) { 123 + remove = append(remove, c) 124 + continue 125 + } 126 + removeUnwanted(c) 127 + } 128 + for _, c := range remove { 129 + n.RemoveChild(c) 130 + } 131 + } 132 + 133 + func isUnwanted(n *html.Node) bool { 134 + if n.Type != html.ElementNode { 135 + return false 136 + } 137 + switch n.Data { 138 + case "script", "style", "nav", "header", "footer", "aside", 139 + "noscript", "iframe", "form", "svg", "button", "input", 140 + "textarea", "select": 141 + return true 142 + } 143 + for _, attr := range n.Attr { 144 + if attr.Key == "class" { 145 + cls := strings.ToLower(attr.Val) 146 + if strings.Contains(cls, 
"comment") || 147 + strings.Contains(cls, "sidebar") || 148 + strings.Contains(cls, "advertisement") || 149 + strings.Contains(cls, "ad-banner") || 150 + strings.Contains(cls, "social-share") || 151 + strings.Contains(cls, "newsletter") || 152 + strings.Contains(cls, "popup") || 153 + strings.Contains(cls, "cookie") || 154 + strings.Contains(cls, "paywall") { 155 + return true 156 + } 157 + } 158 + if attr.Key == "id" { 159 + id := strings.ToLower(attr.Val) 160 + if strings.Contains(id, "comment") || 161 + strings.Contains(id, "sidebar") || 162 + strings.Contains(id, "footer") || 163 + strings.Contains(id, "header") || 164 + strings.Contains(id, "nav") { 165 + return true 166 + } 167 + } 168 + } 169 + return false 170 + } 171 + 172 + func findElement(n *html.Node, tag string) *html.Node { 173 + if n.Type == html.ElementNode && n.Data == tag { 174 + return n 175 + } 176 + for c := n.FirstChild; c != nil; c = c.NextSibling { 177 + if found := findElement(c, tag); found != nil { 178 + return found 179 + } 180 + } 181 + return nil 182 + } 183 + 184 + func findElementByRole(n *html.Node, role string) *html.Node { 185 + if n.Type == html.ElementNode { 186 + for _, attr := range n.Attr { 187 + if attr.Key == "role" && attr.Val == role { 188 + return n 189 + } 190 + } 191 + } 192 + for c := n.FirstChild; c != nil; c = c.NextSibling { 193 + if found := findElementByRole(c, role); found != nil { 194 + return found 195 + } 196 + } 197 + return nil 198 + } 199 + 200 + func findLargestTextNode(root *html.Node) *html.Node { 201 + var best *html.Node 202 + bestLen := 0 203 + 204 + var walk func(*html.Node) 205 + walk = func(n *html.Node) { 206 + if n.Type == html.ElementNode { 207 + switch n.Data { 208 + case "div", "section", "td": 209 + textLen := textLength(n) 210 + if textLen > bestLen && textLen > 200 { 211 + best = n 212 + bestLen = textLen 213 + } 214 + } 215 + } 216 + for c := n.FirstChild; c != nil; c = c.NextSibling { 217 + walk(c) 218 + } 219 + } 220 + walk(root) 221 + 
return best 222 + } 223 + 224 + func textLength(n *html.Node) int { 225 + total := 0 226 + var walk func(*html.Node) 227 + walk = func(c *html.Node) { 228 + if c.Type == html.TextNode { 229 + total += len(strings.TrimSpace(c.Data)) 230 + } 231 + for child := c.FirstChild; child != nil; child = child.NextSibling { 232 + walk(child) 233 + } 234 + } 235 + walk(n) 236 + return total 237 + } 238 + 239 + func renderNode(n *html.Node) string { 240 + var buf strings.Builder 241 + var write func(*html.Node) 242 + write = func(node *html.Node) { 243 + if node.Type == html.TextNode { 244 + buf.WriteString(node.Data) 245 + return 246 + } 247 + if node.Type == html.ElementNode { 248 + buf.WriteString("<") 249 + buf.WriteString(node.Data) 250 + for _, attr := range node.Attr { 251 + buf.WriteString(" ") 252 + buf.WriteString(attr.Key) 253 + buf.WriteString(`="`) 254 + buf.WriteString(html.EscapeString(attr.Val)) 255 + buf.WriteString(`"`) 256 + } 257 + buf.WriteString(">") 258 + } 259 + for c := node.FirstChild; c != nil; c = c.NextSibling { 260 + write(c) 261 + } 262 + if node.Type == html.ElementNode && !isVoidElement(node.Data) { 263 + buf.WriteString("</") 264 + buf.WriteString(node.Data) 265 + buf.WriteString(">") 266 + } 267 + } 268 + for c := n.FirstChild; c != nil; c = c.NextSibling { 269 + write(c) 270 + } 271 + return strings.TrimSpace(buf.String()) 272 + } 273 + 274 + func isVoidElement(tag string) bool { 275 + switch tag { 276 + case "br", "hr", "img", "input", "meta", "link", "area", 277 + "base", "col", "embed", "source", "track", "wbr": 278 + return true 279 + } 280 + return false 281 + }
+144
internal/scraper/scraper_test.go
··· 1 + package scraper 2 + 3 + import ( 4 + "log/slog" 5 + "net/http" 6 + "net/http/httptest" 7 + "strings" 8 + "testing" 9 + 10 + "gotest.tools/v3/assert" 11 + ) 12 + 13 + func TestExtractContent_ArticleElement(t *testing.T) { 14 + html := `<!DOCTYPE html><html><body> 15 + <nav>navigation</nav> 16 + <article><p>This is the article content with enough text to be meaningful and substantial for the reader to consume.</p></article> 17 + <footer>footer</footer> 18 + </body></html>` 19 + 20 + content, err := extractContent(strings.NewReader(html)) 21 + assert.NilError(t, err) 22 + assert.Assert(t, strings.Contains(content, "This is the article content")) 23 + assert.Assert(t, !strings.Contains(content, "navigation")) 24 + assert.Assert(t, !strings.Contains(content, "footer")) 25 + } 26 + 27 + func TestExtractContent_MainElement(t *testing.T) { 28 + html := `<!DOCTYPE html><html><body> 29 + <nav>navigation</nav> 30 + <main><p>Main content area with sufficient text to be considered a proper article body for reading purposes.</p></main> 31 + </body></html>` 32 + 33 + content, err := extractContent(strings.NewReader(html)) 34 + assert.NilError(t, err) 35 + assert.Assert(t, strings.Contains(content, "Main content area")) 36 + } 37 + 38 + func TestExtractContent_RoleMain(t *testing.T) { 39 + html := `<!DOCTYPE html><html><body> 40 + <div role="main"><p>Content in a role=main div with enough text to be useful for the reader to enjoy.</p></div> 41 + </body></html>` 42 + 43 + content, err := extractContent(strings.NewReader(html)) 44 + assert.NilError(t, err) 45 + assert.Assert(t, strings.Contains(content, "Content in a role=main div")) 46 + } 47 + 48 + func TestExtractContent_LargestDiv(t *testing.T) { 49 + html := `<!DOCTYPE html><html><body> 50 + <div class="sidebar">small sidebar text</div> 51 + <div class="content"> 52 + <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. 
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.</p> 53 + </div> 54 + </body></html>` 55 + 56 + content, err := extractContent(strings.NewReader(html)) 57 + assert.NilError(t, err) 58 + assert.Assert(t, strings.Contains(content, "Lorem ipsum")) 59 + } 60 + 61 + func TestExtractContent_RemovesScripts(t *testing.T) { 62 + html := `<!DOCTYPE html><html><body> 63 + <article> 64 + <p>Good content here that is long enough to pass the minimum threshold for content extraction logic.</p> 65 + <script>alert('xss')</script> 66 + <style>.foo { color: red; }</style> 67 + </article> 68 + </body></html>` 69 + 70 + content, err := extractContent(strings.NewReader(html)) 71 + assert.NilError(t, err) 72 + assert.Assert(t, strings.Contains(content, "Good content")) 73 + assert.Assert(t, !strings.Contains(content, "alert")) 74 + assert.Assert(t, !strings.Contains(content, "color: red")) 75 + } 76 + 77 + func TestExtractContent_EmptyBody(t *testing.T) { 78 + html := `<!DOCTYPE html><html><body></body></html>` 79 + 80 + content, err := extractContent(strings.NewReader(html)) 81 + assert.NilError(t, err) 82 + assert.Equal(t, content, "") 83 + } 84 + 85 + func TestScrapeDirect_Integration(t *testing.T) { 86 + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 87 + w.Header().Set("Content-Type", "text/html") 88 + _, _ = w.Write([]byte(`<!DOCTYPE html><html><body> 89 + <article><h1>Title</h1><p>Full article content that has enough substance to be extracted as the primary content of this webpage.</p></article> 90 + </body></html>`)) 91 + })) 92 + defer ts.Close() 93 + 94 + s := New(slog.Default()) 95 + content, err := s.scrapeDirect(t.Context(), ts.URL+"/article") 96 + assert.NilError(t, err) 97 + assert.Assert(t, strings.Contains(content, "Full article content")) 98 + } 99 + 100 + func 
TestScrapeDirect_NonHTML(t *testing.T) { 101 + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 102 + w.WriteHeader(http.StatusForbidden) 103 + })) 104 + defer ts.Close() 105 + 106 + s := New(slog.Default()) 107 + _, err := s.scrapeDirect(t.Context(), ts.URL+"/doc.pdf") 108 + assert.Assert(t, err != nil) 109 + } 110 + 111 + func TestScrape_FallsBackToArchive(t *testing.T) { 112 + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 113 + if r.URL.Path == "/paywall-article" { 114 + w.WriteHeader(http.StatusForbidden) 115 + return 116 + } 117 + w.Header().Set("Content-Type", "text/html") 118 + _, _ = w.Write([]byte(`<!DOCTYPE html><html><body> 119 + <article><p>Archived content successfully retrieved from the archive service for reading.</p></article> 120 + </body></html>`)) 121 + })) 122 + defer ts.Close() 123 + 124 + s := New(slog.Default()) 125 + s.archiveURL = ts.URL + "/archive?q=" 126 + 127 + content, err := s.Scrape(t.Context(), ts.URL+"/paywall-article") 128 + assert.NilError(t, err) 129 + assert.Assert(t, strings.Contains(content, "Archived content")) 130 + } 131 + 132 + func TestRenderNode_VoidElements(t *testing.T) { 133 + html := `<!DOCTYPE html><html><body> 134 + <article> 135 + <p>Text with <br>break and <img src="test.jpg"> image</p> 136 + </article> 137 + </body></html>` 138 + 139 + content, err := extractContent(strings.NewReader(html)) 140 + assert.NilError(t, err) 141 + assert.Assert(t, strings.Contains(content, "<br>")) 142 + assert.Assert(t, strings.Contains(content, "<img")) 143 + assert.Assert(t, !strings.Contains(content, "</br>")) 144 + }
+45
internal/server/articles_handler.go
··· 11 11 12 12 "pkg.rbrt.fr/glean/internal/atproto" 13 13 "pkg.rbrt.fr/glean/internal/db" 14 + "pkg.rbrt.fr/glean/internal/sanitize" 14 15 ) 15 16 16 17 func writeLikeButton(w http.ResponseWriter, articleID int64, liked bool, count int) { ··· 238 239 w.Header().Set("HX-Refresh", "true") 239 240 w.WriteHeader(http.StatusNoContent) 240 241 } 242 + 243 + func (s *Server) handleFetchContent(w http.ResponseWriter, r *http.Request) { 244 + id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64) 245 + if err != nil { 246 + http.Error(w, "invalid id", http.StatusBadRequest) 247 + return 248 + } 249 + 250 + article, err := s.db.GetArticle(r.Context(), id) 251 + if err != nil { 252 + http.Error(w, "article not found", http.StatusNotFound) 253 + return 254 + } 255 + 256 + if !article.URL.Valid { 257 + s.logger.Warn("cannot fetch content: article has no URL", "id", id) 258 + http.Error(w, "article has no URL", http.StatusBadRequest) 259 + return 260 + } 261 + 262 + content, err := s.scraper.Scrape(r.Context(), article.URL.String) 263 + if err != nil { 264 + s.logger.Error("failed to scrape article", "error", err, "url", article.URL.String) 265 + w.Header().Set("Content-Type", "text/html") 266 + _, _ = fmt.Fprintf(w, `<div id="article-content" class="text-spot-secondary text-sm">Failed to fetch content. <button hx-post="/articles/%d/fetch-content" hx-target="#article-content" hx-swap="outerHTML" class="text-spot-green underline">Retry</button></div>`, id) 267 + return 268 + } 269 + 270 + if content == "" { 271 + w.Header().Set("Content-Type", "text/html") 272 + _, _ = fmt.Fprintf(w, `<div id="article-content" class="text-spot-secondary text-sm">No readable content found. 
<a href="%s" target="_blank" rel="noopener noreferrer" class="text-spot-green underline">Read on original site</a></div>`, article.URL.String) 273 + return 274 + } 275 + 276 + cleaned := sanitize.HTML(content) 277 + 278 + if err := s.db.UpdateArticleFullContent(r.Context(), id, cleaned); err != nil { 279 + s.logger.Error("failed to save full content", "error", err, "id", id) 280 + } 281 + 282 + w.Header().Set("Content-Type", "text/html") 283 + _, _ = fmt.Fprintf(w, `<div id="article-content" class="article-body">%s</div>`, cleaned) 284 + s.logger.Info("scraped article content", "id", id, "url", article.URL.String, "content_len", len(cleaned)) 285 + }
+23
internal/server/server.go
··· 26 26 "pkg.rbrt.fr/glean/internal/feed" 27 27 "pkg.rbrt.fr/glean/internal/metrics" 28 28 "pkg.rbrt.fr/glean/internal/sanitize" 29 + "pkg.rbrt.fr/glean/internal/scraper" 29 30 ) 30 31 31 32 func splitString(s, sep string) []string { ··· 41 42 oauthStore *db.OAuthStore 42 43 fetcher *feed.Fetcher 43 44 scheduler *feed.Scheduler 45 + scraper *scraper.Scraper 44 46 clientID string 45 47 callbackURL string 46 48 } ··· 69 71 oauthStore: oauthStore, 70 72 fetcher: feed.NewFetcher(), 71 73 scheduler: scheduler, 74 + scraper: scraper.New(logger), 72 75 clientID: clientID, 73 76 callbackURL: callbackURL, 74 77 } ··· 144 147 r.Post("/{id}/read", s.handleMarkRead) 145 148 r.Post("/{id}/unread", s.handleMarkUnread) 146 149 r.Post("/{id}/like", s.handleLikeArticle) 150 + r.Post("/{id}/fetch-content", s.handleFetchContent) 147 151 r.Post("/mark-all-read", s.handleMarkAllRead) 148 152 }) 149 153 ··· 263 267 } 264 268 } 265 269 return "" 270 + }, 271 + "isEmbedURL": func(rawURL string) bool { 272 + u, err := url.Parse(rawURL) 273 + if err != nil { 274 + return false 275 + } 276 + host := strings.ToLower(u.Hostname()) 277 + for _, h := range []string{ 278 + "www.youtube.com", "youtube.com", "m.youtube.com", "youtu.be", 279 + "vimeo.com", "player.vimeo.com", 280 + "open.spotify.com", "embed.spotify.com", 281 + "w.soundcloud.com", 282 + "bandcamp.com", 283 + } { 284 + if host == h { 285 + return true 286 + } 287 + } 288 + return false 266 289 }, 267 290 "sanitizeHTML": func(input string) template.HTML { 268 291 return template.HTML(sanitize.HTML(input))
+21
internal/tmpl/article_detail.html
··· 61 61 {{end}} 62 62 {{end}} 63 63 64 + {{if or .Article.Content.Valid .Article.FullContent.Valid .Article.Summary.Valid}} 64 65 <hr class="my-8 border-spot-divider"> 66 + {{end}} 65 67 66 68 {{if .Article.Content.Valid}} 67 69 <div class="article-body"> 68 70 {{sanitizeHTML .Article.Content.String}} 69 71 </div> 72 + {{else if .Article.FullContent.Valid}} 73 + <div class="article-body"> 74 + {{sanitizeHTML .Article.FullContent.String}} 75 + </div> 70 76 {{else if .Article.Summary.Valid}} 71 77 <div class="article-body"> 72 78 {{sanitizeHTML .Article.Summary.String}} 79 + </div> 80 + {{end}} 81 + 82 + {{if and (not .Article.Content.Valid) (not .Article.FullContent.Valid) .Article.URL.Valid (not (isEmbedURL .Article.URL.String))}} 83 + <div id="article-content" class="mt-6"> 84 + <button hx-post="/articles/{{.Article.ID}}/fetch-content" hx-target="#article-content" hx-swap="outerHTML" 85 + class="border border-spot-outline text-spot-text rounded-pill px-2.5 py-1.5 text-xs font-bold uppercase tracking-button hover:border-spot-text transition inline-flex items-center gap-1.5"> 86 + <svg class="w-3.5 h-3.5" fill="none" stroke="currentColor" stroke-width="1.5" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" d="M3 16.5v2.25A2.25 2.25 0 005.25 21h13.5A2.25 2.25 0 0021 18.75V16.5M16.5 12L12 16.5m0 0L7.5 12m4.5 4.5V3"/></svg> 87 + Fetch full content 88 + </button> 73 89 </div> 74 90 {{else if not .Article.URL.Valid}} 75 91 <p class="text-spot-secondary">No content available.</p> ··· 110 126 {{end}} 111 127 </div> 112 128 </section> 129 + 130 + <a href="/articles" class="text-sm text-spot-secondary hover:text-spot-text mt-8 inline-flex items-center gap-1.5 transition"> 131 + <svg class="w-4 h-4" fill="none" stroke="currentColor" stroke-width="2" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" d="M15.75 19.5L8.25 12l7.5-7.5"/></svg> 132 + Back to articles 133 + </a> 113 134 </div> 114 135 115 136 <script>