cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang
29
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 305 lines 7.5 kB view raw
1package articles 2 3import ( 4 "encoding/json" 5 "strings" 6 7 "github.com/antchfx/htmlquery" 8 "golang.org/x/net/html" 9) 10 11// MetadataExtractor implements multi-strategy metadata extraction from HTML documents. 12// It attempts to extract article metadata using OpenGraph, Schema.org, meta tags, 13// and semantic HTML5 elements, with fallback chains for each field. 14type MetadataExtractor struct{} 15 16// NewMetadataExtractor creates a new metadata extractor. 17func NewMetadataExtractor() *MetadataExtractor { 18 return &MetadataExtractor{} 19} 20 21// ExtractMetadata extracts all available metadata from an HTML document. 22// Returns an ExtractionResult with populated metadata fields. 23func (m *MetadataExtractor) ExtractMetadata(doc *html.Node) *ExtractionResult { 24 if doc == nil { 25 return &ExtractionResult{} 26 } 27 28 result := &ExtractionResult{} 29 30 result.Title = m.ExtractTitle(doc) 31 result.Author = m.ExtractAuthor(doc) 32 result.PublishedDate = m.ExtractPublishedDate(doc) 33 result.SiteName = m.ExtractSiteName(doc) 34 result.Language = m.ExtractLanguage(doc) 35 36 return result 37} 38 39// ExtractTitle extracts the article title using multiple strategies. 40// Tries in order: OpenGraph, Schema.org, meta tags, h1, title tag. 
41func (m *MetadataExtractor) ExtractTitle(doc *html.Node) string { 42 if doc == nil { 43 return "" 44 } 45 46 if title := m.getMetaContent(doc, "property", "og:title"); title != "" { 47 return title 48 } 49 50 if title := m.getSchemaOrgField(doc, "headline"); title != "" { 51 return title 52 } 53 54 if title := m.getSchemaOrgField(doc, "name"); title != "" { 55 return title 56 } 57 58 if title := m.getMetaContent(doc, "name", "twitter:title"); title != "" { 59 return title 60 } 61 62 if title := m.getMetaContent(doc, "property", "article:title"); title != "" { 63 return title 64 } 65 66 if h1 := htmlquery.FindOne(doc, "//h1"); h1 != nil { 67 if title := htmlquery.InnerText(h1); title != "" { 68 return strings.TrimSpace(title) 69 } 70 } 71 72 if titleNode := htmlquery.FindOne(doc, "//title"); titleNode != nil { 73 if title := htmlquery.InnerText(titleNode); title != "" { 74 return strings.TrimSpace(title) 75 } 76 } 77 78 return "" 79} 80 81// ExtractAuthor extracts the article author using multiple strategies. 82// Tries in order: OpenGraph, Schema.org, meta tags, rel=author, byline elements. 
83func (m *MetadataExtractor) ExtractAuthor(doc *html.Node) string { 84 if doc == nil { 85 return "" 86 } 87 88 if author := m.getMetaContent(doc, "property", "og:author"); author != "" { 89 return author 90 } 91 92 if author := m.getSchemaOrgField(doc, "author"); author != "" { 93 return author 94 } 95 96 if author := m.getMetaContent(doc, "property", "article:author"); author != "" { 97 return author 98 } 99 100 if author := m.getMetaContent(doc, "name", "twitter:creator"); author != "" { 101 return author 102 } 103 104 if author := m.getMetaContent(doc, "name", "author"); author != "" { 105 return author 106 } 107 108 if authorLink := htmlquery.FindOne(doc, "//a[@rel='author']"); authorLink != nil { 109 if author := htmlquery.InnerText(authorLink); author != "" { 110 return strings.TrimSpace(author) 111 } 112 } 113 114 bylineSelectors := []string{ 115 "//span[contains(@class, 'author')]", 116 "//div[contains(@class, 'author')]", 117 "//p[contains(@class, 'byline')]", 118 "//span[contains(@class, 'byline')]", 119 } 120 121 for _, selector := range bylineSelectors { 122 if node := htmlquery.FindOne(doc, selector); node != nil { 123 if author := htmlquery.InnerText(node); author != "" { 124 return strings.TrimSpace(author) 125 } 126 } 127 } 128 129 return "" 130} 131 132// ExtractPublishedDate extracts the publication date using multiple strategies. 133// Tries in order: OpenGraph, Schema.org, article:published_time, time elements. 
134func (m *MetadataExtractor) ExtractPublishedDate(doc *html.Node) string { 135 if doc == nil { 136 return "" 137 } 138 139 if date := m.getMetaContent(doc, "property", "og:published_time"); date != "" { 140 return date 141 } 142 143 if date := m.getSchemaOrgField(doc, "datePublished"); date != "" { 144 return date 145 } 146 147 if date := m.getSchemaOrgField(doc, "publishDate"); date != "" { 148 return date 149 } 150 151 if date := m.getMetaContent(doc, "property", "article:published_time"); date != "" { 152 return date 153 } 154 155 if date := m.getMetaContent(doc, "name", "publication_date"); date != "" { 156 return date 157 } 158 159 if date := m.getMetaContent(doc, "name", "date"); date != "" { 160 return date 161 } 162 163 if timeNode := htmlquery.FindOne(doc, "//time[@datetime]"); timeNode != nil { 164 for _, attr := range timeNode.Attr { 165 if attr.Key == "datetime" { 166 return attr.Val 167 } 168 } 169 } 170 171 return "" 172} 173 174// ExtractSiteName extracts the site name using multiple strategies. 175// Tries in order: OpenGraph, Schema.org, meta tags. 176func (m *MetadataExtractor) ExtractSiteName(doc *html.Node) string { 177 if doc == nil { 178 return "" 179 } 180 181 if siteName := m.getMetaContent(doc, "property", "og:site_name"); siteName != "" { 182 return siteName 183 } 184 185 if publisher := m.getSchemaOrgField(doc, "publisher"); publisher != "" { 186 return publisher 187 } 188 189 if siteName := m.getMetaContent(doc, "name", "application-name"); siteName != "" { 190 return siteName 191 } 192 193 return "" 194} 195 196// ExtractLanguage extracts the document language. 197// Tries in order: html lang attribute, OpenGraph, meta tags. 
198func (m *MetadataExtractor) ExtractLanguage(doc *html.Node) string { 199 if doc == nil { 200 return "" 201 } 202 203 if htmlNode := htmlquery.FindOne(doc, "//html"); htmlNode != nil { 204 for _, attr := range htmlNode.Attr { 205 if attr.Key == "lang" { 206 return attr.Val 207 } 208 } 209 } 210 211 if locale := m.getMetaContent(doc, "property", "og:locale"); locale != "" { 212 return locale 213 } 214 215 if lang := m.getMetaContent(doc, "http-equiv", "content-language"); lang != "" { 216 return lang 217 } 218 219 return "" 220} 221 222// getMetaContent retrieves the content attribute from a meta tag. 223// Searches for meta tags with the specified attribute name and value. 224func (m *MetadataExtractor) getMetaContent(doc *html.Node, attrName, attrValue string) string { 225 if doc == nil { 226 return "" 227 } 228 229 xpath := "//meta[@" + attrName + "='" + attrValue + "']" 230 metaNode := htmlquery.FindOne(doc, xpath) 231 232 if metaNode == nil { 233 return "" 234 } 235 236 for _, attr := range metaNode.Attr { 237 if attr.Key == "content" { 238 return strings.TrimSpace(attr.Val) 239 } 240 } 241 242 return "" 243} 244 245// getSchemaOrgField extracts a field from Schema.org JSON-LD structured data. 
246func (m *MetadataExtractor) getSchemaOrgField(doc *html.Node, fieldName string) string { 247 if doc == nil { 248 return "" 249 } 250 251 scripts := htmlquery.Find(doc, "//script[@type='application/ld+json']") 252 253 for _, script := range scripts { 254 if script.FirstChild == nil || script.FirstChild.Type != html.TextNode { 255 continue 256 } 257 258 var data map[string]any 259 if err := json.Unmarshal([]byte(script.FirstChild.Data), &data); err != nil { 260 continue 261 } 262 263 context, hasContext := data["@context"] 264 typeVal, hasType := data["@type"] 265 266 if !hasContext || !hasType { 267 continue 268 } 269 270 contextStr, ok := context.(string) 271 if !ok || !strings.Contains(contextStr, "schema.org") { 272 continue 273 } 274 275 typeStr, ok := typeVal.(string) 276 if !ok || (!strings.Contains(typeStr, "Article") && !strings.Contains(typeStr, "NewsArticle")) { 277 continue 278 } 279 280 if value, exists := data[fieldName]; exists { 281 return m.extractStringValue(value) 282 } 283 } 284 285 return "" 286} 287 288// extractStringValue extracts a string from various JSON value types. 289func (m *MetadataExtractor) extractStringValue(value any) string { 290 switch v := value.(type) { 291 case string: 292 return v 293 case map[string]any: 294 if name, exists := v["name"]; exists { 295 if nameStr, ok := name.(string); ok { 296 return nameStr 297 } 298 } 299 case []any: 300 if len(v) > 0 { 301 return m.extractStringValue(v[0]) 302 } 303 } 304 return "" 305}