package app import ( "fmt" "io" "net/http" "net/url" "strings" "github.com/PuerkitoBio/goquery" ) func downloadAssetFile(url string) ([]byte, error) { resp, err := http.Get(url) if err != nil { return nil, err } defer resp.Body.Close() return io.ReadAll(resp.Body) } type page struct { Title string Description string Body string ImageURL string } // FetchArticleHTML returns page struct and error func FetchArticleHTML(urlToFetch string) (page, error) { page := page{} url, err := url.Parse(urlToFetch) if err != nil { return page, err } resp, err := http.Get(url.String()) if err != nil { return page, err } defer resp.Body.Close() doc, err := goquery.NewDocumentFromReader(resp.Body) if err != nil { return page, err } doc.Find(`head link[rel="stylesheet"]`).Each(func(i int, s *goquery.Selection) { v, ok := s.Attr("href") if !ok || v == "" { return } if strings.HasPrefix(v, "/") { styles, err := downloadAssetFile(fmt.Sprintf("https://%s%s", url.Hostname(), v)) if err != nil { return } doc.Find("head").AppendHtml("") s.Remove() } }) doc.Find("a").Each(func(i int, s *goquery.Selection) { href, ok := s.Attr("href") if !ok || href == "" { return } if strings.HasPrefix(href, "/") { s.SetAttr("href", "https://"+url.Hostname()+href) } }) doc.Find("img").Each(func(i int, s *goquery.Selection) { src, ok := s.Attr("src") if !ok || src == "" { return } if strings.HasPrefix(src, "/") { s.SetAttr("src", "https://"+url.Hostname()+src) } }) doc.Find("script").Each(func(i int, s *goquery.Selection) { s.Remove() }) page.Title = doc.Find("head title").Text() page.Body, _ = doc.Html() page.Description = doc.Find(`head meta[name="description"]`).AttrOr("content", "") if page.Description == "" { page.Description = doc.Find(`head meta[property="og:description"]`).AttrOr("content", "") } page.ImageURL = doc.Find(`head meta[property="og:image"]`).AttrOr("content", "") if page.ImageURL == "" { doc.Find("img").Each(func(i int, s *goquery.Selection) { if page.ImageURL != "" { return } src, exists := s.Attr("src") if !exists { return } u, err := url.Parse(src) if err != nil { return } allowedTypes := []string{"png", "jpg", "jpeg", "webp"} isAllowed := false for _, tp := range allowedTypes { if strings.HasSuffix(u.Path, tp) { isAllowed = true } } if isAllowed { page.ImageURL = src } }) } return page, err }