Files
webarchive/adapters/processors/internal/mediainline.go
derfenix 870f13f7bf Improved single_file processor, refactoring
Reduce inlined image size, get page metadata before save and put into processing queue
2023-11-24 14:25:17 +03:00

256 lines
5.5 KiB
Go

package internal
import (
"bytes"
"context"
"encoding/base64"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"github.com/disintegration/imaging"
"github.com/gabriel-vasile/mimetype"
"go.uber.org/zap"
"golang.org/x/net/html"
)
// MediaInline rewrites a parsed HTML document so that resources referenced
// by link, script and img elements are embedded as base64 data URIs and
// anchor hrefs are resolved against the page URL.
type MediaInline struct {
	// log receives diagnostics about resources that could not be loaded
	// or processed.
	log *zap.Logger

	// getter fetches the resource at the given absolute URL.
	getter func(context.Context, string) (*http.Response, error)
}
// NewMediaInline builds a MediaInline that logs through log and downloads
// resources through getter.
func NewMediaInline(log *zap.Logger, getter func(context.Context, string) (*http.Response, error)) *MediaInline {
	inliner := new(MediaInline)
	inliner.log = log
	inliner.getter = getter

	return inliner
}
// Inline parses the HTML read from reader, walks every node of the resulting
// tree and rewrites resource references relative to pageURL.
//
// It returns the root of the modified tree. An error is returned only when
// the HTML or pageURL cannot be parsed; failures on individual nodes are
// logged by the tree walk and do not abort it.
func (m *MediaInline) Inline(ctx context.Context, reader io.Reader, pageURL string) (*html.Node, error) {
	root, parseErr := html.Parse(reader)
	if parseErr != nil {
		return nil, fmt.Errorf("parse response body: %w", parseErr)
	}

	base, urlErr := url.Parse(pageURL)
	if urlErr != nil {
		return nil, fmt.Errorf("parse page url: %w", urlErr)
	}

	m.visit(ctx, root, m.processorFunc, base)

	return root, nil
}
// processorFunc dispatches a single node to the handler matching its tag
// name: link elements get their href inlined, script and img elements get
// their src inlined, and anchors get their href made absolute. Nodes with any
// other name are left untouched.
func (m *MediaInline) processorFunc(ctx context.Context, node *html.Node, baseURL *url.URL) error {
	var (
		action string
		err    error
	)

	switch tag := node.Data; tag {
	case "a":
		action = "process a href"
		err = m.processAHref(node.Attr, baseURL)
	case "link":
		action = "process link"
		err = m.processHref(ctx, node.Attr, baseURL)
	case "img", "script":
		// Both tags share the same message prefix.
		action = "process script"
		err = m.processSrc(ctx, node.Attr, baseURL)
	default:
		return nil
	}

	if err == nil {
		return nil
	}

	return fmt.Errorf("%s %s: %w", action, node.Attr, err)
}
// processAHref rewrites every href attribute in attrs to the absolute form
// produced by normalizeURL against baseURL.
//
// normalizeURL returns an empty string for URLs it refuses to resolve
// (the about: scheme). In that case the original attribute value is kept
// instead of being blanked, so the link target is not lost.
//
// The returned error is always nil; the signature mirrors the other
// attribute processors.
func (m *MediaInline) processAHref(attrs []html.Attribute, baseURL *url.URL) error {
	for idx := range attrs {
		if attrs[idx].Key != "href" {
			continue
		}

		if normalized := normalizeURL(attrs[idx].Val, baseURL); normalized != "" {
			attrs[idx].Val = normalized
		}
	}

	return nil
}
// processHref inlines the resource referenced by a link element's href
// attribute when its rel attribute is one of the supported values
// (stylesheet, the icon variants, manifest).
//
// attrs is modified in place: the href value is replaced with whatever
// loadAndEncode returns. Elements without an href attribute are skipped, so
// no other attribute is ever overwritten.
func (m *MediaInline) processHref(ctx context.Context, attrs []html.Attribute, baseURL *url.URL) error {
	var shouldProcess bool

	// -1 marks "no href seen"; index 0 is a valid attribute position and
	// must not double as the sentinel.
	hrefIdx := -1

	for idx, attr := range attrs {
		switch attr.Key {
		case "rel":
			switch attr.Val {
			case "stylesheet", "icon", "alternate icon", "shortcut icon", "manifest":
				shouldProcess = true
			}

		case "href":
			hrefIdx = idx
		}
	}

	if !shouldProcess || hrefIdx < 0 {
		return nil
	}

	encodedValue, err := m.loadAndEncode(ctx, baseURL, attrs[hrefIdx].Val)
	if err != nil {
		return err
	}

	attrs[hrefIdx].Val = encodedValue

	return nil
}
// processSrc inlines the resource referenced by a script or img element.
//
// The resource URL is taken from a non-empty data-src attribute when one is
// present, otherwise from src. The choice does not depend on the order in
// which the attributes appear. The encoded result is always written to the
// src attribute; elements without a src attribute are left untouched.
func (m *MediaInline) processSrc(ctx context.Context, attrs []html.Attribute, baseURL *url.URL) error {
	var dataSrc string

	// -1 marks "no src seen".
	srcIdx := -1

	for idx, attr := range attrs {
		switch attr.Key {
		case "src":
			srcIdx = idx
		case "data-src":
			dataSrc = attr.Val
		}
	}

	if srcIdx < 0 {
		return nil
	}

	value := attrs[srcIdx].Val
	if dataSrc != "" {
		// An empty data-src must not wipe out a usable src.
		value = dataSrc
	}

	encodedValue, err := m.loadAndEncode(ctx, baseURL, value)
	if err != nil {
		return err
	}

	attrs[srcIdx].Val = encodedValue

	return nil
}
// loadAndEncode downloads the resource that value points to (resolved
// against baseURL) and returns it as a base64 data URI.
//
// The original value is returned unchanged, with a nil error, when:
//   - value is already a data: URI,
//   - normalizeURL refuses to resolve it,
//   - the getter fails (the failure is logged),
//   - the server answers with a 4xx/5xx status (logged), so that an error
//     page is never embedded in place of the resource.
//
// An empty value yields an empty result. A non-nil error is returned only
// when the response body cannot be read or preprocessed.
func (m *MediaInline) loadAndEncode(ctx context.Context, baseURL *url.URL, value string) (string, error) {
	if value == "" {
		return "", nil
	}

	// Already inlined; there is nothing to fetch.
	if strings.HasPrefix(value, "data:") {
		return value, nil
	}

	normalizedURL := normalizeURL(value, baseURL)
	if normalizedURL == "" {
		return value, nil
	}

	response, err := m.getter(ctx, normalizedURL)
	if err != nil {
		m.log.Sugar().With(zap.Error(err)).Errorf("load %s", normalizedURL)

		return value, nil
	}

	defer func() {
		_ = response.Body.Close()
	}()

	// Only reject explicit client/server errors; a zero status (as produced
	// by a hand-built response) is still accepted.
	if response.StatusCode >= http.StatusBadRequest {
		m.log.Sugar().Errorf("load %s: unexpected status %d", normalizedURL, response.StatusCode)

		return value, nil
	}

	mime := "text/plain"
	if ct := response.Header.Get("Content-Type"); ct != "" {
		mime = ct
	}

	// encodeResource may overwrite mime when it re-encodes the payload.
	encodedVal, err := m.encodeResource(response.Body, &mime)
	if err != nil {
		return value, fmt.Errorf("encode resource: %w", err)
	}

	// NOTE(review): everything from the first "+" on is dropped, which turns
	// "image/svg+xml" into "image/svg" — confirm this is intended.
	cleanMime, _, _ := strings.Cut(mime, "+")

	return fmt.Sprintf("data:%s;base64, %s", cleanMime, encodedVal), nil
}
// visit walks n, its descendants and its following siblings in document
// (pre-order) sequence, calling proc on every node. Errors returned by proc
// are logged and do not stop the walk.
func (m *MediaInline) visit(ctx context.Context, n *html.Node, proc func(context.Context, *html.Node, *url.URL) error, baseURL *url.URL) {
	for current := n; ; current = current.NextSibling {
		if err := proc(ctx, current, baseURL); err != nil {
			m.log.Error("process error", zap.Error(err))
		}

		// Descend before moving on to the next sibling.
		if child := current.FirstChild; child != nil {
			m.visit(ctx, child, proc, baseURL)
		}

		if current.NextSibling == nil {
			return
		}
	}
}
// normalizeURL turns resourceURL, as found in an HTML attribute, into an
// absolute URL resolved against base.
//
//   - Surrounding whitespace is stripped first, as HTML permits it around
//     URL attribute values.
//   - Protocol-relative URLs ("//host/path") inherit the scheme of base and
//     fall back to https when base is nil or has no scheme.
//   - URLs with the about: scheme yield an empty string.
//   - A URL that cannot be parsed is returned as is (trimmed).
//   - With a nil base the parsed URL is returned without resolution.
func normalizeURL(resourceURL string, base *url.URL) string {
	resourceURL = strings.TrimSpace(resourceURL)

	if strings.HasPrefix(resourceURL, "//") {
		scheme := "https"
		if base != nil && base.Scheme != "" {
			scheme = base.Scheme
		}

		return scheme + ":" + resourceURL
	}

	if strings.HasPrefix(resourceURL, "about:") {
		return ""
	}

	parsedResourceURL, err := url.Parse(resourceURL)
	if err != nil {
		return resourceURL
	}

	if base == nil {
		return parsedResourceURL.String()
	}

	return base.ResolveReference(parsedResourceURL).String()
}
// encodeResource reads r to the end, passes the bytes through
// preprocessResource and returns the result encoded with standard base64.
//
// mime is handed to preprocessResource, which may overwrite it when the
// payload is re-encoded in a different format.
func (m *MediaInline) encodeResource(r io.Reader, mime *string) (string, error) {
	raw, readErr := io.ReadAll(r)
	if readErr != nil {
		return "", fmt.Errorf("read data: %w", readErr)
	}

	processed, prepErr := m.preprocessResource(raw, mime)
	if prepErr != nil {
		return "", fmt.Errorf("preprocess resource: %w", prepErr)
	}

	encoded := base64.StdEncoding.EncodeToString(processed)

	return encoded, nil
}
// preprocessResource shrinks oversized images before they are inlined.
//
// When the detected content type of data starts with "image" and either
// dimension exceeds 1024 pixels, the image is scaled down to fit within
// 1024x1024 with its aspect ratio preserved, re-encoded as JPEG (quality 90),
// and *mime is set to "image/jpeg". The image is never cropped or upscaled.
//
// Everything else — non-image data, images that are small enough, and images
// that fail to decode or re-encode (logged) — is returned unchanged. The
// returned error is always nil.
func (m *MediaInline) preprocessResource(data []byte, mime *string) ([]byte, error) {
	const maxSide = 1024

	if !strings.HasPrefix(mimetype.Detect(data).String(), "image") {
		return data, nil
	}

	decodedImage, err := imaging.Decode(bytes.NewReader(data))
	if err != nil {
		m.log.Error("failed to decode image", zap.Error(err))

		return data, nil
	}

	if size := decodedImage.Bounds().Size(); size.X <= maxSide && size.Y <= maxSide {
		return data, nil
	}

	// Fit keeps the whole picture; Thumbnail would crop it to a square.
	resized := imaging.Fit(decodedImage, maxSide, maxSide, imaging.Lanczos)

	var buf bytes.Buffer
	if err := imaging.Encode(&buf, resized, imaging.JPEG, imaging.JPEGQuality(90)); err != nil {
		m.log.Error("failed to create resized image", zap.Error(err))

		return data, nil
	}

	*mime = "image/jpeg"

	m.log.Info("Resized")

	return buf.Bytes(), nil
}