···101101 author TEXT,
102102 summary TEXT,
103103 content TEXT,
104104+ full_content TEXT,
104105 published DATETIME,
105106 updated DATETIME,
106107 fetched_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
···111112 article_id INTEGER NOT NULL REFERENCES articles(id),
112113 is_read BOOLEAN NOT NULL DEFAULT 0,
113114 read_at DATETIME,
114114- is_starred BOOLEAN NOT NULL DEFAULT 0,
115115- starred_at DATETIME,
116115 PRIMARY KEY (user_did, article_id)
117116 )`,
118117 `CREATE TABLE IF NOT EXISTS annotations (
···185184 `CREATE INDEX IF NOT EXISTS idx_articles_feed ON articles(feed_url)`,
186185 `CREATE INDEX IF NOT EXISTS idx_articles_published ON articles(published DESC)`,
187186 `CREATE INDEX IF NOT EXISTS idx_read_state_unread ON read_state(user_did, is_read) WHERE is_read = 0`,
188188- `CREATE INDEX IF NOT EXISTS idx_read_state_starred ON read_state(user_did, is_starred) WHERE is_starred = 1`,
189187 `CREATE INDEX IF NOT EXISTS idx_annotations_article ON annotations(article_url)`,
190188 `CREATE INDEX IF NOT EXISTS idx_likes_article ON likes(feed_url, article_url)`,
191189 `CREATE INDEX IF NOT EXISTS idx_likes_author ON likes(author_did)`,
+281
internal/scraper/scraper.go
···11+package scraper
22+33+import (
44+ "bytes"
55+ "context"
66+ "fmt"
77+ "io"
88+ "log/slog"
99+ "net/http"
1010+ "strings"
1111+ "time"
1212+1313+ "golang.org/x/net/html"
1414+)
// Scraper downloads article pages and extracts their main content as an
// HTML fragment, falling back to an archive service when the direct fetch
// fails or yields nothing.
type Scraper struct {
	// client performs every outbound HTTP request.
	client *http.Client
	// logger receives warnings when the direct scrape has to fall back.
	logger *slog.Logger
	// archiveURL is the prefix the article URL is appended to when building
	// the archive lookup address.
	archiveURL string
}
2121+2222+func New(logger *slog.Logger) *Scraper {
2323+ return &Scraper{
2424+ client: &http.Client{
2525+ Timeout: 15 * time.Second,
2626+ CheckRedirect: func(req *http.Request, via []*http.Request) error {
2727+ if len(via) >= 10 {
2828+ return fmt.Errorf("too many redirects")
2929+ }
3030+ return nil
3131+ },
3232+ },
3333+ logger: logger,
3434+ archiveURL: "https://archive.is/newest/",
3535+ }
3636+}
3737+3838+func (s *Scraper) Scrape(ctx context.Context, articleURL string) (string, error) {
3939+ content, err := s.scrapeDirect(ctx, articleURL)
4040+ if err != nil {
4141+ s.logger.Warn("direct scrape failed, trying archive.is", "error", err, "url", articleURL)
4242+ return s.scrapeArchive(ctx, articleURL)
4343+ }
4444+ if content == "" {
4545+ s.logger.Warn("direct scrape returned empty content, trying archive.is", "url", articleURL)
4646+ return s.scrapeArchive(ctx, articleURL)
4747+ }
4848+ return content, nil
4949+}
5050+5151+func (s *Scraper) scrapeDirect(ctx context.Context, articleURL string) (string, error) {
5252+ body, err := s.fetch(ctx, articleURL)
5353+ if err != nil {
5454+ return "", fmt.Errorf("fetching article: %w", err)
5555+ }
5656+ return extractContent(body)
5757+}
5858+5959+func (s *Scraper) scrapeArchive(ctx context.Context, articleURL string) (string, error) {
6060+ archiveURL := s.archiveURL + articleURL
6161+ body, err := s.fetch(ctx, archiveURL)
6262+ if err != nil {
6363+ return "", fmt.Errorf("fetching from archive.is: %w", err)
6464+ }
6565+ return extractContent(body)
6666+}
6767+6868+func (s *Scraper) fetch(ctx context.Context, url string) (io.Reader, error) {
6969+ req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
7070+ if err != nil {
7171+ return nil, err
7272+ }
7373+ req.Header.Set("User-Agent", "Glean/1.0 (RSS Reader)")
7474+7575+ resp, err := s.client.Do(req)
7676+ if err != nil {
7777+ return nil, err
7878+ }
7979+ defer resp.Body.Close()
8080+8181+ if resp.StatusCode != http.StatusOK {
8282+ return nil, fmt.Errorf("unexpected status: %d", resp.StatusCode)
8383+ }
8484+8585+ data, err := io.ReadAll(io.LimitReader(resp.Body, 2*1024*1024))
8686+ if err != nil {
8787+ return nil, fmt.Errorf("reading body: %w", err)
8888+ }
8989+ return bytes.NewReader(data), nil
9090+}
9191+9292+func extractContent(r io.Reader) (string, error) {
9393+ doc, err := html.Parse(r)
9494+ if err != nil {
9595+ return "", fmt.Errorf("parsing HTML: %w", err)
9696+ }
9797+9898+ removeUnwanted(doc)
9999+100100+ if content := findElement(doc, "article"); content != nil {
101101+ return renderNode(content), nil
102102+ }
103103+104104+ if content := findElementByRole(doc, "main"); content != nil {
105105+ return renderNode(content), nil
106106+ }
107107+108108+ if content := findElement(doc, "main"); content != nil {
109109+ return renderNode(content), nil
110110+ }
111111+112112+ if content := findLargestTextNode(doc); content != nil {
113113+ return renderNode(content), nil
114114+ }
115115+116116+ return "", nil
117117+}
118118+119119+func removeUnwanted(n *html.Node) {
120120+ var remove []*html.Node
121121+ for c := n.FirstChild; c != nil; c = c.NextSibling {
122122+ if isUnwanted(c) {
123123+ remove = append(remove, c)
124124+ continue
125125+ }
126126+ removeUnwanted(c)
127127+ }
128128+ for _, c := range remove {
129129+ n.RemoveChild(c)
130130+ }
131131+}
132132+133133+func isUnwanted(n *html.Node) bool {
134134+ if n.Type != html.ElementNode {
135135+ return false
136136+ }
137137+ switch n.Data {
138138+ case "script", "style", "nav", "header", "footer", "aside",
139139+ "noscript", "iframe", "form", "svg", "button", "input",
140140+ "textarea", "select":
141141+ return true
142142+ }
143143+ for _, attr := range n.Attr {
144144+ if attr.Key == "class" {
145145+ cls := strings.ToLower(attr.Val)
146146+ if strings.Contains(cls, "comment") ||
147147+ strings.Contains(cls, "sidebar") ||
148148+ strings.Contains(cls, "advertisement") ||
149149+ strings.Contains(cls, "ad-banner") ||
150150+ strings.Contains(cls, "social-share") ||
151151+ strings.Contains(cls, "newsletter") ||
152152+ strings.Contains(cls, "popup") ||
153153+ strings.Contains(cls, "cookie") ||
154154+ strings.Contains(cls, "paywall") {
155155+ return true
156156+ }
157157+ }
158158+ if attr.Key == "id" {
159159+ id := strings.ToLower(attr.Val)
160160+ if strings.Contains(id, "comment") ||
161161+ strings.Contains(id, "sidebar") ||
162162+ strings.Contains(id, "footer") ||
163163+ strings.Contains(id, "header") ||
164164+ strings.Contains(id, "nav") {
165165+ return true
166166+ }
167167+ }
168168+ }
169169+ return false
170170+}
171171+172172+func findElement(n *html.Node, tag string) *html.Node {
173173+ if n.Type == html.ElementNode && n.Data == tag {
174174+ return n
175175+ }
176176+ for c := n.FirstChild; c != nil; c = c.NextSibling {
177177+ if found := findElement(c, tag); found != nil {
178178+ return found
179179+ }
180180+ }
181181+ return nil
182182+}
183183+184184+func findElementByRole(n *html.Node, role string) *html.Node {
185185+ if n.Type == html.ElementNode {
186186+ for _, attr := range n.Attr {
187187+ if attr.Key == "role" && attr.Val == role {
188188+ return n
189189+ }
190190+ }
191191+ }
192192+ for c := n.FirstChild; c != nil; c = c.NextSibling {
193193+ if found := findElementByRole(c, role); found != nil {
194194+ return found
195195+ }
196196+ }
197197+ return nil
198198+}
199199+200200+func findLargestTextNode(root *html.Node) *html.Node {
201201+ var best *html.Node
202202+ bestLen := 0
203203+204204+ var walk func(*html.Node)
205205+ walk = func(n *html.Node) {
206206+ if n.Type == html.ElementNode {
207207+ switch n.Data {
208208+ case "div", "section", "td":
209209+ textLen := textLength(n)
210210+ if textLen > bestLen && textLen > 200 {
211211+ best = n
212212+ bestLen = textLen
213213+ }
214214+ }
215215+ }
216216+ for c := n.FirstChild; c != nil; c = c.NextSibling {
217217+ walk(c)
218218+ }
219219+ }
220220+ walk(root)
221221+ return best
222222+}
223223+224224+func textLength(n *html.Node) int {
225225+ total := 0
226226+ var walk func(*html.Node)
227227+ walk = func(c *html.Node) {
228228+ if c.Type == html.TextNode {
229229+ total += len(strings.TrimSpace(c.Data))
230230+ }
231231+ for child := c.FirstChild; child != nil; child = child.NextSibling {
232232+ walk(child)
233233+ }
234234+ }
235235+ walk(n)
236236+ return total
237237+}
238238+239239+func renderNode(n *html.Node) string {
240240+ var buf strings.Builder
241241+ var write func(*html.Node)
242242+ write = func(node *html.Node) {
243243+ if node.Type == html.TextNode {
244244+ buf.WriteString(node.Data)
245245+ return
246246+ }
247247+ if node.Type == html.ElementNode {
248248+ buf.WriteString("<")
249249+ buf.WriteString(node.Data)
250250+ for _, attr := range node.Attr {
251251+ buf.WriteString(" ")
252252+ buf.WriteString(attr.Key)
253253+ buf.WriteString(`="`)
254254+ buf.WriteString(html.EscapeString(attr.Val))
255255+ buf.WriteString(`"`)
256256+ }
257257+ buf.WriteString(">")
258258+ }
259259+ for c := node.FirstChild; c != nil; c = c.NextSibling {
260260+ write(c)
261261+ }
262262+ if node.Type == html.ElementNode && !isVoidElement(node.Data) {
263263+ buf.WriteString("</")
264264+ buf.WriteString(node.Data)
265265+ buf.WriteString(">")
266266+ }
267267+ }
268268+ for c := n.FirstChild; c != nil; c = c.NextSibling {
269269+ write(c)
270270+ }
271271+ return strings.TrimSpace(buf.String())
272272+}
// isVoidElement reports whether tag names an element that is serialized
// without a closing tag. The comparison is exact, so callers must pass the
// lowercase tag name.
func isVoidElement(tag string) bool {
	voidTags := [...]string{
		"br", "hr", "img", "input", "meta", "link", "area",
		"base", "col", "embed", "source", "track", "wbr",
	}
	for _, void := range voidTags {
		if tag == void {
			return true
		}
	}
	return false
}
+144
internal/scraper/scraper_test.go
···11+package scraper
22+33+import (
44+ "log/slog"
55+ "net/http"
66+ "net/http/httptest"
77+ "strings"
88+ "testing"
99+1010+ "gotest.tools/v3/assert"
1111+)
1212+1313+func TestExtractContent_ArticleElement(t *testing.T) {
1414+ html := `<!DOCTYPE html><html><body>
1515+ <nav>navigation</nav>
1616+ <article><p>This is the article content with enough text to be meaningful and substantial for the reader to consume.</p></article>
1717+ <footer>footer</footer>
1818+ </body></html>`
1919+2020+ content, err := extractContent(strings.NewReader(html))
2121+ assert.NilError(t, err)
2222+ assert.Assert(t, strings.Contains(content, "This is the article content"))
2323+ assert.Assert(t, !strings.Contains(content, "navigation"))
2424+ assert.Assert(t, !strings.Contains(content, "footer"))
2525+}
2626+2727+func TestExtractContent_MainElement(t *testing.T) {
2828+ html := `<!DOCTYPE html><html><body>
2929+ <nav>navigation</nav>
3030+ <main><p>Main content area with sufficient text to be considered a proper article body for reading purposes.</p></main>
3131+ </body></html>`
3232+3333+ content, err := extractContent(strings.NewReader(html))
3434+ assert.NilError(t, err)
3535+ assert.Assert(t, strings.Contains(content, "Main content area"))
3636+}
3737+3838+func TestExtractContent_RoleMain(t *testing.T) {
3939+ html := `<!DOCTYPE html><html><body>
4040+ <div role="main"><p>Content in a role=main div with enough text to be useful for the reader to enjoy.</p></div>
4141+ </body></html>`
4242+4343+ content, err := extractContent(strings.NewReader(html))
4444+ assert.NilError(t, err)
4545+ assert.Assert(t, strings.Contains(content, "Content in a role=main div"))
4646+}
4747+4848+func TestExtractContent_LargestDiv(t *testing.T) {
4949+ html := `<!DOCTYPE html><html><body>
5050+ <div class="sidebar">small sidebar text</div>
5151+ <div class="content">
5252+ <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.</p>
5353+ </div>
5454+ </body></html>`
5555+5656+ content, err := extractContent(strings.NewReader(html))
5757+ assert.NilError(t, err)
5858+ assert.Assert(t, strings.Contains(content, "Lorem ipsum"))
5959+}
6060+6161+func TestExtractContent_RemovesScripts(t *testing.T) {
6262+ html := `<!DOCTYPE html><html><body>
6363+ <article>
6464+ <p>Good content here that is long enough to pass the minimum threshold for content extraction logic.</p>
6565+ <script>alert('xss')</script>
6666+ <style>.foo { color: red; }</style>
6767+ </article>
6868+ </body></html>`
6969+7070+ content, err := extractContent(strings.NewReader(html))
7171+ assert.NilError(t, err)
7272+ assert.Assert(t, strings.Contains(content, "Good content"))
7373+ assert.Assert(t, !strings.Contains(content, "alert"))
7474+ assert.Assert(t, !strings.Contains(content, "color: red"))
7575+}
7676+7777+func TestExtractContent_EmptyBody(t *testing.T) {
7878+ html := `<!DOCTYPE html><html><body></body></html>`
7979+8080+ content, err := extractContent(strings.NewReader(html))
8181+ assert.NilError(t, err)
8282+ assert.Equal(t, content, "")
8383+}
8484+8585+func TestScrapeDirect_Integration(t *testing.T) {
8686+ ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
8787+ w.Header().Set("Content-Type", "text/html")
8888+ _, _ = w.Write([]byte(`<!DOCTYPE html><html><body>
8989+ <article><h1>Title</h1><p>Full article content that has enough substance to be extracted as the primary content of this webpage.</p></article>
9090+ </body></html>`))
9191+ }))
9292+ defer ts.Close()
9393+9494+ s := New(slog.Default())
9595+ content, err := s.scrapeDirect(t.Context(), ts.URL+"/article")
9696+ assert.NilError(t, err)
9797+ assert.Assert(t, strings.Contains(content, "Full article content"))
9898+}
9999+100100+func TestScrapeDirect_NonHTML(t *testing.T) {
101101+ ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
102102+ w.WriteHeader(http.StatusForbidden)
103103+ }))
104104+ defer ts.Close()
105105+106106+ s := New(slog.Default())
107107+ _, err := s.scrapeDirect(t.Context(), ts.URL+"/doc.pdf")
108108+ assert.Assert(t, err != nil)
109109+}
110110+111111+func TestScrape_FallsBackToArchive(t *testing.T) {
112112+ ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
113113+ if r.URL.Path == "/paywall-article" {
114114+ w.WriteHeader(http.StatusForbidden)
115115+ return
116116+ }
117117+ w.Header().Set("Content-Type", "text/html")
118118+ _, _ = w.Write([]byte(`<!DOCTYPE html><html><body>
119119+ <article><p>Archived content successfully retrieved from the archive service for reading.</p></article>
120120+ </body></html>`))
121121+ }))
122122+ defer ts.Close()
123123+124124+ s := New(slog.Default())
125125+ s.archiveURL = ts.URL + "/archive?q="
126126+127127+ content, err := s.Scrape(t.Context(), ts.URL+"/paywall-article")
128128+ assert.NilError(t, err)
129129+ assert.Assert(t, strings.Contains(content, "Archived content"))
130130+}
131131+132132+func TestRenderNode_VoidElements(t *testing.T) {
133133+ html := `<!DOCTYPE html><html><body>
134134+ <article>
135135+ <p>Text with <br>break and <img src="test.jpg"> image</p>
136136+ </article>
137137+ </body></html>`
138138+139139+ content, err := extractContent(strings.NewReader(html))
140140+ assert.NilError(t, err)
141141+ assert.Assert(t, strings.Contains(content, "<br>"))
142142+ assert.Assert(t, strings.Contains(content, "<img"))
143143+ assert.Assert(t, !strings.Contains(content, "</br>"))
144144+}