add description & image for articles

This commit is contained in:
2025-12-23 17:50:55 +03:00
parent 869a513773
commit cdc68c1b53
4 changed files with 120 additions and 24 deletions

View File

@@ -20,22 +20,31 @@ func downloadAssetFile(url string) ([]byte, error) {
return io.ReadAll(resp.Body)
}
// FetchArticleHTML returns page's title, html and error
func FetchArticleHTML(urlToFetch string) (string, string, error) {
// page holds the metadata and markup that FetchArticleHTML extracts from a
// fetched article.
type page struct {
	// Title is the text of the document's <head><title> element.
	Title string
	// Description is the content of the "description" meta tag, falling back
	// to the "og:description" meta tag when the former is empty.
	Description string
	// Body is the HTML of the whole document as serialized by FetchArticleHTML.
	Body string
	// ImageURL is the content of the "og:image" meta tag, or else the src of
	// the first <img> whose path ends in png, jpg, jpeg or webp. It is empty
	// when neither is found.
	ImageURL string
}
// FetchArticleHTML fetches urlToFetch and returns the extracted page, or an error if the URL cannot be parsed, fetched, or read as HTML.
func FetchArticleHTML(urlToFetch string) (page, error) {
page := page{}
url, err := url.Parse(urlToFetch)
if err != nil {
return "", "", err
return page, err
}
resp, err := http.Get(url.String())
if err != nil {
return "", "", err
return page, err
}
defer resp.Body.Close()
doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
return "", "", err
return page, err
}
doc.Find(`head link[rel="stylesheet"]`).Each(func(i int, s *goquery.Selection) {
@@ -65,12 +74,59 @@ func FetchArticleHTML(urlToFetch string) (string, string, error) {
}
})
doc.Find("img").Each(func(i int, s *goquery.Selection) {
src, ok := s.Attr("src")
if !ok || src == "" {
return
}
if strings.HasPrefix(src, "/") {
s.SetAttr("src", "https://"+url.Hostname()+src)
}
})
doc.Find("script").Each(func(i int, s *goquery.Selection) {
s.Remove()
})
title := doc.Find("head title").Text()
page.Title = doc.Find("head title").Text()
page.Body, _ = doc.Html()
html, _ := doc.Html()
return title, html, err
page.Description = doc.Find(`head meta[name="description"]`).AttrOr("content", "")
if page.Description == "" {
page.Description = doc.Find(`head meta[property="og:description"]`).AttrOr("content", "")
}
page.ImageURL = doc.Find(`head meta[property="og:image"]`).AttrOr("content", "")
if page.ImageURL == "" {
doc.Find("img").Each(func(i int, s *goquery.Selection) {
if page.ImageURL != "" {
return
}
src, exists := s.Attr("src")
if !exists {
return
}
u, err := url.Parse(src)
if err != nil {
return
}
allowedTypes := []string{"png", "jpg", "jpeg", "webp"}
isAllowed := false
for _, tp := range allowedTypes {
if strings.HasSuffix(u.Path, tp) {
isAllowed = true
}
}
if isAllowed {
page.ImageURL = src
}
})
}
return page, err
}