133 lines
2.5 KiB
Go
133 lines
2.5 KiB
Go
package app
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"strings"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
// downloadAssetFile fetches url with an HTTP GET and returns the response body.
//
// A response whose status is outside the 2xx range is reported as an error
// rather than returned as content, so an error page served by the remote host
// is never mistaken for the asset itself. The request is made with http.Get
// and therefore uses http.DefaultClient.
func downloadAssetFile(url string) ([]byte, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, fmt.Errorf("downloading asset %q: %w", url, err)
	}
	defer resp.Body.Close()

	// Anything other than 2xx means the body is not the asset we asked for.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return nil, fmt.Errorf("downloading asset %q: unexpected status %s", url, resp.Status)
	}

	data, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("reading asset %q: %w", url, err)
	}
	return data, nil
}
|
|
|
|
// page is the result produced by FetchArticleHTML for one fetched document.
//
// As filled in by FetchArticleHTML:
//   - Title is the text of the <title> element found in <head>.
//   - Description is taken from the description meta tags in <head>.
//   - Body is the serialized HTML of the processed document.
//   - ImageURL is taken from the og:image meta tag or, failing that, from an
//     <img> element in the document.
type page struct {
	Title, Description, Body, ImageURL string
}
|
|
|
|
// FetchArticleHTML returns page struct and error
|
|
func FetchArticleHTML(urlToFetch string) (page, error) {
|
|
page := page{}
|
|
|
|
url, err := url.Parse(urlToFetch)
|
|
if err != nil {
|
|
return page, err
|
|
}
|
|
|
|
resp, err := http.Get(url.String())
|
|
if err != nil {
|
|
return page, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
|
if err != nil {
|
|
return page, err
|
|
}
|
|
|
|
doc.Find(`head link[rel="stylesheet"]`).Each(func(i int, s *goquery.Selection) {
|
|
v, ok := s.Attr("href")
|
|
if !ok || v == "" {
|
|
return
|
|
}
|
|
|
|
if strings.HasPrefix(v, "/") {
|
|
styles, err := downloadAssetFile(fmt.Sprintf("https://%s%s", url.Hostname(), v))
|
|
if err != nil {
|
|
return
|
|
}
|
|
doc.Find("head").AppendHtml("<style>" + string(styles) + "</style>")
|
|
s.Remove()
|
|
}
|
|
})
|
|
|
|
doc.Find("a").Each(func(i int, s *goquery.Selection) {
|
|
href, ok := s.Attr("href")
|
|
if !ok || href == "" {
|
|
return
|
|
}
|
|
|
|
if strings.HasPrefix(href, "/") {
|
|
s.SetAttr("href", "https://"+url.Hostname()+href)
|
|
}
|
|
})
|
|
|
|
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
|
src, ok := s.Attr("src")
|
|
if !ok || src == "" {
|
|
return
|
|
}
|
|
|
|
if strings.HasPrefix(src, "/") {
|
|
s.SetAttr("src", "https://"+url.Hostname()+src)
|
|
}
|
|
})
|
|
|
|
doc.Find("script").Each(func(i int, s *goquery.Selection) {
|
|
s.Remove()
|
|
})
|
|
|
|
page.Title = doc.Find("head title").Text()
|
|
page.Body, _ = doc.Html()
|
|
|
|
page.Description = doc.Find(`head meta[name="description"]`).AttrOr("content", "")
|
|
if page.Description == "" {
|
|
page.Description = doc.Find(`head meta[property="og:description"]`).AttrOr("content", "")
|
|
}
|
|
|
|
page.ImageURL = doc.Find(`head meta[property="og:image"]`).AttrOr("content", "")
|
|
if page.ImageURL == "" {
|
|
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
|
if page.ImageURL != "" {
|
|
return
|
|
}
|
|
|
|
src, exists := s.Attr("src")
|
|
if !exists {
|
|
return
|
|
}
|
|
|
|
u, err := url.Parse(src)
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
allowedTypes := []string{"png", "jpg", "jpeg", "webp"}
|
|
isAllowed := false
|
|
for _, tp := range allowedTypes {
|
|
if strings.HasSuffix(u.Path, tp) {
|
|
isAllowed = true
|
|
}
|
|
}
|
|
|
|
if isAllowed {
|
|
page.ImageURL = src
|
|
}
|
|
})
|
|
}
|
|
|
|
return page, err
|
|
}
|