nice clean recipes pear.dunkirk.sh
recipes
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

feat: improve parser to add generic html parser and unescape html

+411 -27
+228
internal/extract/generic/generic.go
··· 1 + package generic 2 + 3 + import ( 4 + "strings" 5 + 6 + "tangled.org/dunkirk.sh/pear/internal/extract/schema" 7 + "tangled.org/dunkirk.sh/pear/internal/models" 8 + 9 + "golang.org/x/net/html" 10 + ) 11 + 12 + func Extract(body string) (*models.Recipe, bool) { 13 + doc, err := html.Parse(strings.NewReader(body)) 14 + if err != nil { 15 + return nil, false 16 + } 17 + 18 + recipe := &models.Recipe{} 19 + recipe.ExtractionMethod = "generic" 20 + 21 + found := false 22 + 23 + if name := findByClass(doc, "recipe-title", "recipe-name"); name != "" { 24 + recipe.Name = name 25 + found = true 26 + } 27 + if desc := findByMetaContent(doc, "description"); desc != "" { 28 + recipe.Description = desc 29 + } 30 + if img := findByItempropImage(doc); img != "" { 31 + recipe.ImageURL = img 32 + } else if img := findByMetaContent(doc, "og:image"); img != "" { 33 + recipe.ImageURL = img 34 + } 35 + if yield := findByClass(doc, "serves"); yield != "" { 36 + recipe.Yield = strings.TrimPrefix(yield, "Serves ") 37 + } 38 + 39 + ingredients := collectIngredients(doc) 40 + if len(ingredients) > 0 { 41 + found = true 42 + for _, ing := range ingredients { 43 + recipe.Ingredients = append(recipe.Ingredients, schema.ParseIngredient(ing)) 44 + } 45 + } 46 + 47 + instructions := collectInstructions(doc) 48 + if len(instructions) > 0 { 49 + found = true 50 + for _, instr := range instructions { 51 + recipe.Instructions = append(recipe.Instructions, models.Instruction{Text: instr}) 52 + } 53 + } 54 + 55 + if !found { 56 + return nil, false 57 + } 58 + 59 + if recipe.Name == "" { 60 + if title := findByMetaContent(doc, "og:title"); title != "" { 61 + recipe.Name = title 62 + } 63 + } 64 + 65 + if recipe.Name == "" { 66 + return nil, false 67 + } 68 + 69 + return recipe, true 70 + } 71 + 72 + func findByClass(n *html.Node, classes ...string) string { 73 + var result string 74 + var f func(*html.Node) 75 + f = func(n *html.Node) { 76 + if n.Type == html.ElementNode { 77 + for _, class := range classes { 78 + if hasClass(n, class) { 79 + result = textContent(n) 80 + return 81 + } 82 + } 83 + } 84 + for c := n.FirstChild; c != nil; c = c.NextSibling { 85 + f(c) 86 + if result != "" { 87 + return 88 + } 89 + } 90 + } 91 + f(n) 92 + return result 93 + } 94 + 95 + func hasClass(n *html.Node, class string) bool { 96 + for _, attr := range n.Attr { 97 + if attr.Key == "class" { 98 + for _, c := range strings.Fields(attr.Val) { 99 + if c == class { 100 + return true 101 + } 102 + } 103 + } 104 + } 105 + return false 106 + } 107 + 108 + func findByMetaContent(n *html.Node, name string) string { 109 + var f func(*html.Node) string 110 + f = func(n *html.Node) string { 111 + if n.Type == html.ElementNode && n.Data == "meta" { 112 + metaName, metaProp, metaContent := "", "", "" 113 + for _, attr := range n.Attr { 114 + switch attr.Key { 115 + case "name": 116 + metaName = attr.Val 117 + case "property": 118 + metaProp = attr.Val 119 + case "content": 120 + metaContent = attr.Val 121 + } 122 + } 123 + if (metaName == name || metaProp == name) && metaContent != "" { 124 + return metaContent 125 + } 126 + } 127 + for c := n.FirstChild; c != nil; c = c.NextSibling { 128 + if v := f(c); v != "" { 129 + return v 130 + } 131 + } 132 + return "" 133 + } 134 + return f(n) 135 + } 136 + 137 + func findByItempropImage(n *html.Node) string { 138 + var f func(*html.Node) string 139 + f = func(n *html.Node) string { 140 + if n.Type == html.ElementNode { 141 + hasImageProp := false 142 + for _, attr := range n.Attr { 143 + if attr.Key == "itemprop" && attr.Val == "image" { 144 + hasImageProp = true 145 + break 146 + } 147 + } 148 + if hasImageProp && n.Data == "img" { 149 + for _, attr := range n.Attr { 150 + if attr.Key == "src" { 151 + return attr.Val 152 + } 153 + } 154 + } 155 + } 156 + for c := n.FirstChild; c != nil; c = c.NextSibling { 157 + if v := f(c); v != "" { 158 + return v 159 + } 160 + } 161 + return "" 162 + } 163 + return f(n) 164 + } 165 + 166 + func collectIngredients(n *html.Node) []string { 167 + container := findNodeByClass(n, "ingredients", "recipe-ingredients") 168 + if container == nil { 169 + return nil 170 + } 171 + var items []string 172 + collectParagraphsAndListItems(container, &items) 173 + return items 174 + } 175 + 176 + func collectInstructions(n *html.Node) []string { 177 + container := findNodeByClass(n, "directions", "instructions", "recipe-instructions", "recipe-directions") 178 + if container == nil { 179 + return nil 180 + } 181 + var items []string 182 + collectParagraphsAndListItems(container, &items) 183 + return items 184 + } 185 + 186 + func findNodeByClass(n *html.Node, classes ...string) *html.Node { 187 + var f func(*html.Node) *html.Node 188 + f = func(n *html.Node) *html.Node { 189 + if n.Type == html.ElementNode { 190 + for _, class := range classes { 191 + if hasClass(n, class) { 192 + return n 193 + } 194 + } 195 + } 196 + for c := n.FirstChild; c != nil; c = c.NextSibling { 197 + if found := f(c); found != nil { 198 + return found 199 + } 200 + } 201 + return nil 202 + } 203 + return f(n) 204 + } 205 + 206 + func collectParagraphsAndListItems(n *html.Node, items *[]string) { 207 + for c := n.FirstChild; c != nil; c = c.NextSibling { 208 + if c.Type == html.ElementNode && (c.Data == "p" || c.Data == "li") { 209 + text := strings.TrimSpace(textContent(c)) 210 + if text != "" { 211 + *items = append(*items, text) 212 + } 213 + } else { 214 + collectParagraphsAndListItems(c, items) 215 + } 216 + } 217 + } 218 + 219 + func textContent(n *html.Node) string { 220 + if n.Type == html.TextNode { 221 + return n.Data 222 + } 223 + var sb strings.Builder 224 + for c := n.FirstChild; c != nil; c = c.NextSibling { 225 + sb.WriteString(textContent(c)) 226 + } 227 + return strings.TrimSpace(sb.String()) 228 + }
+58
internal/extract/generic/generic_test.go
··· 1 + package generic 2 + 3 + import ( 4 + "testing" 5 + ) 6 + 7 + const testHTML = ` 8 + <html><body> 9 + <img itemprop="image" src="https://example.com/photo.jpg" alt="Smoothie"/> 10 + <main class="recipe-content"> 11 + <h1 class="recipe-title">Creamy Ginger Green Smoothie</h1> 12 + <h4 class="serves">Serves 1</h4> 13 + <div class="ingredients"> 14 + <h3>Ingredients:</h3> 15 + <p>2 handfuls organic <a href="/spinach/">spinach</a></p> 16 + <p>1 cup filtered water</p> 17 + <p>1/2 avocado</p> 18 + <p>1 medium banana</p> 19 + </div> 20 + <div class="directions"> 21 + <h3>Directions:</h3> 22 + <div class="instructions"> 23 + <p>Simply add all ingredients in a high-speed blender and blend until thick and creamy.</p> 24 + <p>You may add ice if you'd like to chill further.</p> 25 + <p>Enjoy!</p> 26 + </div> 27 + </div> 28 + </main> 29 + </body></html> 30 + ` 31 + 32 + func TestNutritionStripped(t *testing.T) { 33 + recipe, ok := Extract(testHTML) 34 + if !ok { 35 + t.Fatal("Extract returned false") 36 + } 37 + t.Logf("Name: %q", recipe.Name) 38 + t.Logf("Yield: %q", recipe.Yield) 39 + t.Logf("ImageURL: %q", recipe.ImageURL) 40 + t.Logf("Ingredients (%d):", len(recipe.Ingredients)) 41 + for i, ing := range recipe.Ingredients { 42 + t.Logf(" [%d] RawText=%q Name=%q", i, ing.RawText, ing.Name) 43 + } 44 + t.Logf("Instructions (%d):", len(recipe.Instructions)) 45 + for i, instr := range recipe.Instructions { 46 + t.Logf(" [%d] Text=%q", i, instr.Text) 47 + } 48 + 49 + if len(recipe.Ingredients) == 0 { 50 + t.Error("expected ingredients, got none") 51 + } 52 + if len(recipe.Instructions) == 0 { 53 + t.Error("expected instructions, got none") 54 + } 55 + if recipe.Name != "Creamy Ginger Green Smoothie" { 56 + t.Errorf("unexpected name: %q", recipe.Name) 57 + } 58 + }
+48 -26
internal/extract/pipeline.go
··· 11 11 "strings" 12 12 "time" 13 13 14 + "tangled.org/dunkirk.sh/pear/internal/extract/generic" 14 15 "tangled.org/dunkirk.sh/pear/internal/extract/hrecipe" 15 16 "tangled.org/dunkirk.sh/pear/internal/extract/marmiton" 16 17 "tangled.org/dunkirk.sh/pear/internal/extract/schema" ··· 64 65 65 66 lang := detectLanguage(body) 66 67 67 - if recipe, ok := marmiton.Extract(body); ok { 68 - recipe.SourceURL = targetURL 69 - recipe.SourceDomain = domainOf(targetURL) 70 - recipe.Language = lang 71 - return &Result{Recipe: recipe} 68 + type candidate struct { 69 + recipe *models.Recipe 70 + method string 72 71 } 72 + var fallbacks []candidate 73 73 74 - if recipe, ok := wprm.Extract(body); ok { 75 - recipe.SourceURL = targetURL 76 - recipe.SourceDomain = domainOf(targetURL) 77 - recipe.Language = lang 78 - return &Result{Recipe: recipe} 74 + tryExtract := func(r *models.Recipe, ok bool, method string) *Result { 75 + if !ok || r == nil { 76 + return nil 77 + } 78 + r.SourceURL = targetURL 79 + r.SourceDomain = domainOf(targetURL) 80 + r.Language = lang 81 + r.Normalize() 82 + if len(r.Instructions) > 0 { 83 + return &Result{Recipe: r} 84 + } 85 + fallbacks = append(fallbacks, candidate{r, method}) 86 + return nil 79 87 } 80 88 81 - if recipe, ok := schema.Extract(body); ok { 82 - recipe.SourceURL = targetURL 83 - recipe.SourceDomain = domainOf(targetURL) 84 - recipe.Language = lang 85 - return &Result{Recipe: recipe} 89 + if r, ok := marmiton.Extract(body); true { 90 + if result := tryExtract(r, ok, "marmiton"); result != nil { 91 + return result 92 + } 86 93 } 87 - 88 - if recipe, ok := schema.ExtractMicrodata(body); ok { 89 - recipe.SourceURL = targetURL 90 - recipe.SourceDomain = domainOf(targetURL) 91 - recipe.Language = lang 92 - return &Result{Recipe: recipe} 94 + if r, ok := wprm.Extract(body); true { 95 + if result := tryExtract(r, ok, "wprm"); result != nil { 96 + return result 97 + } 98 + } 99 + if r, ok := schema.Extract(body); true { 100 + if result := tryExtract(r, ok, "schema.org"); result != nil { 101 + return result 102 + } 103 + } 104 + if r, ok := schema.ExtractMicrodata(body); true { 105 + if result := tryExtract(r, ok, "microdata"); result != nil { 106 + return result 107 + } 108 + } 109 + if r, ok := hrecipe.Extract(body); true { 110 + if result := tryExtract(r, ok, "h-recipe"); result != nil { 111 + return result 112 + } 113 + } 114 + if r, ok := generic.Extract(body); true { 115 + if result := tryExtract(r, ok, "generic"); result != nil { 116 + return result 117 + } 93 118 } 94 119 95 - if recipe, ok := hrecipe.Extract(body); ok { 96 - recipe.SourceURL = targetURL 97 - recipe.SourceDomain = domainOf(targetURL) 98 - recipe.Language = lang 99 - return &Result{Recipe: recipe} 120 + if len(fallbacks) > 0 { 121 + return &Result{Recipe: fallbacks[0].recipe} 100 122 } 101 123 102 124 return &Result{Error: fmt.Errorf("no recipe found on page - tried JSON-LD, microdata, and h-recipe extraction")}
+60
internal/extract/pipeline_test.go
··· 1 + package extract 2 + 3 + import ( 4 + "testing" 5 + 6 + "tangled.org/dunkirk.sh/pear/internal/extract/generic" 7 + "tangled.org/dunkirk.sh/pear/internal/extract/schema" 8 + ) 9 + 10 + func TestPipelineNutritionStripped(t *testing.T) { 11 + const html = `<!DOCTYPE html> 12 + <html><head> 13 + <script type="application/ld+json"> 14 + {"@context":"https://schema.org","@graph":[{"@type":"Article","headline":"Test"},{"@type":"WebPage","name":"Test"}]} 15 + </script> 16 + <script type="application/ld+json"> 17 + { 18 + "@context": "http://schema.org", 19 + "@type": "Recipe", 20 + "name": "Creamy Ginger Green Smoothie", 21 + "description": "A creamy smoothie", 22 + "recipeIngredient": ["2 handfuls organic spinach","1 cup filtered water"], 23 + "recipeInstructions": [] 24 + } 25 + </script> 26 + </head><body> 27 + <h1 class="recipe-title">Creamy Ginger Green Smoothie</h1> 28 + <div class="ingredients"> 29 + <p>2 handfuls organic spinach</p> 30 + <p>1 cup filtered water</p> 31 + </div> 32 + <div class="directions"> 33 + <div class="instructions"> 34 + <p>Add all ingredients to a blender and blend until creamy.</p> 35 + <p>Enjoy!</p> 36 + </div> 37 + </div> 38 + </body></html>` 39 + 40 + recipe, ok := schema.Extract(html) 41 + if !ok { 42 + t.Fatal("JSON-LD extractor should have found the recipe") 43 + } 44 + if len(recipe.Instructions) != 0 { 45 + t.Errorf("expected no instructions from JSON-LD, got %d", len(recipe.Instructions)) 46 + } 47 + t.Logf("JSON-LD: Name=%q Instructions=%d Ingredients=%d", recipe.Name, len(recipe.Instructions), len(recipe.Ingredients)) 48 + 49 + recipe2, ok2 := generic.Extract(html) 50 + if !ok2 { 51 + t.Fatal("generic extractor should have found the recipe") 52 + } 53 + if len(recipe2.Instructions) == 0 { 54 + t.Error("generic extractor should have found instructions") 55 + } 56 + if len(recipe2.Ingredients) == 0 { 57 + t.Error("generic extractor should have found ingredients") 58 + } 59 + t.Logf("Generic: Name=%q Instructions=%d Ingredients=%d", recipe2.Name, len(recipe2.Instructions), len(recipe2.Ingredients)) 60 + }
+17 -1
internal/models/recipe.go
··· 1 1 package models 2 2 3 - import "time" 3 + import ( 4 + "html" 5 + "time" 6 + ) 4 7 5 8 type Recipe struct { 6 9 Name string ··· 36 39 Recipe []byte 37 40 ExtractionMethod string 38 41 FetchedAt time.Time 42 + } 43 + 44 + func (r *Recipe) Normalize() { 45 + r.Name = html.UnescapeString(r.Name) 46 + r.Description = html.UnescapeString(r.Description) 47 + for i := range r.Ingredients { 48 + r.Ingredients[i].RawText = html.UnescapeString(r.Ingredients[i].RawText) 49 + r.Ingredients[i].Name = html.UnescapeString(r.Ingredients[i].Name) 50 + r.Ingredients[i].Group = html.UnescapeString(r.Ingredients[i].Group) 51 + } 52 + for i := range r.Instructions { 53 + r.Instructions[i].Text = html.UnescapeString(r.Instructions[i].Text) 54 + } 39 55 }