diff --git a/.gitignore b/.gitignore index 2ed58d1..e05e1c4 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,5 @@ fabric.properties go.work test.http db +http-client.env.json +http-client.private.env.json diff --git a/.idea/swagger-settings.xml b/.idea/swagger-settings.xml new file mode 100644 index 0000000..01d844c --- /dev/null +++ b/.idea/swagger-settings.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/adapters/processors/internal/mediainline.go b/adapters/processors/internal/mediainline.go new file mode 100644 index 0000000..dc1e788 --- /dev/null +++ b/adapters/processors/internal/mediainline.go @@ -0,0 +1,255 @@ +package internal + +import ( + "bytes" + "context" + "encoding/base64" + "fmt" + "io" + "net/http" + "net/url" + "strings" + + "github.com/disintegration/imaging" + "github.com/gabriel-vasile/mimetype" + "go.uber.org/zap" + "golang.org/x/net/html" +) + +type MediaInline struct { + log *zap.Logger + getter func(context.Context, string) (*http.Response, error) +} + +func NewMediaInline(log *zap.Logger, getter func(context.Context, string) (*http.Response, error)) *MediaInline { + return &MediaInline{log: log, getter: getter} +} + +func (m *MediaInline) Inline(ctx context.Context, reader io.Reader, pageURL string) (*html.Node, error) { + htmlNode, err := html.Parse(reader) + if err != nil { + return nil, fmt.Errorf("parse response body: %w", err) + } + + baseURL, err := url.Parse(pageURL) + if err != nil { + return nil, fmt.Errorf("parse page url: %w", err) + } + + m.visit(ctx, htmlNode, m.processorFunc, baseURL) + + return htmlNode, nil +} + +func (m *MediaInline) processorFunc(ctx context.Context, node *html.Node, baseURL *url.URL) error { + switch node.Data { + case "link": + if err := m.processHref(ctx, node.Attr, baseURL); err != nil { + return fmt.Errorf("process link %s: %w", node.Attr, err) + } + + case "script", "img": + if err := m.processSrc(ctx, node.Attr, baseURL); err != nil { + return fmt.Errorf("process script %s: %w", node.Attr, err) + } + + case "a": + if err := m.processAHref(node.Attr, baseURL); err != nil { + return fmt.Errorf("process a href %s: %w", node.Attr, err) + } + } + + return nil +} + +func (m *MediaInline) processAHref(attrs []html.Attribute, baseURL *url.URL) error { + for idx, attr := range attrs { + switch attr.Key { + case "href": + attrs[idx].Val = normalizeURL(attr.Val, baseURL) + } + } + + return nil +} + +func (m *MediaInline) processHref(ctx context.Context, attrs []html.Attribute, baseURL *url.URL) error { + var shouldProcess bool + var value string + var valueIdx int + + for idx, attr := range attrs { + switch attr.Key { + case "rel": + switch attr.Val { + case "stylesheet", "icon", "alternate icon", "shortcut icon", "manifest": + shouldProcess = true + } + + case "href": + value = attr.Val + valueIdx = idx + } + } + + if !shouldProcess { + return nil + } + + encodedValue, err := m.loadAndEncode(ctx, baseURL, value) + if err != nil { + return err + } + + attrs[valueIdx].Val = encodedValue + + return nil +} + +func (m *MediaInline) processSrc(ctx context.Context, attrs []html.Attribute, baseURL *url.URL) error { + var shouldProcess bool + var value string + var valueIdx int + + for idx, attr := range attrs { + switch attr.Key { + case "src": + value = attr.Val + valueIdx = idx + shouldProcess = true + case "data-src": + value = attr.Val + } + } + + if !shouldProcess { + return nil + } + + encodedValue, err := m.loadAndEncode(ctx, baseURL, value) + if err != nil { + return err + } + + attrs[valueIdx].Val = encodedValue + + return nil +} + +func (m *MediaInline) loadAndEncode(ctx context.Context, baseURL *url.URL, value string) (string, error) { + mime := "text/plain" + + if value == "" { + return "", nil + } + + normalizedURL := normalizeURL(value, baseURL) + if normalizedURL == "" { + return value, nil + } + + response, err := m.getter(ctx, normalizedURL) + if err != nil { + m.log.Sugar().With(zap.Error(err)).Errorf("load %s", normalizedURL) + return value, nil + } + + defer func() { + _ = response.Body.Close() + }() + + cleanMime := func(s string) string { + s, _, _ = strings.Cut(s, "+") + return s + } + + if ct := response.Header.Get("Content-Type"); ct != "" { + mime = ct + } + + encodedVal, err := m.encodeResource(response.Body, &mime) + if err != nil { + return value, fmt.Errorf("encode resource: %w", err) + } + + return fmt.Sprintf("data:%s;base64, %s", cleanMime(mime), encodedVal), nil +} + +func (m *MediaInline) visit(ctx context.Context, n *html.Node, proc func(context.Context, *html.Node, *url.URL) error, baseURL *url.URL) { + if err := proc(ctx, n, baseURL); err != nil { + m.log.Error("process error", zap.Error(err)) + } + + if n.FirstChild != nil { + m.visit(ctx, n.FirstChild, proc, baseURL) + } + + if n.NextSibling != nil { + m.visit(ctx, n.NextSibling, proc, baseURL) + } +} + +func normalizeURL(resourceURL string, base *url.URL) string { + if strings.HasPrefix(resourceURL, "//") { + return "https:" + resourceURL + } + + if strings.HasPrefix(resourceURL, "about:") { + return "" + } + + parsedResourceURL, err := url.Parse(resourceURL) + if err != nil { + return resourceURL + } + + reference := base.ResolveReference(parsedResourceURL) + + return reference.String() +} + +func (m *MediaInline) encodeResource(r io.Reader, mime *string) (string, error) { + all, err := io.ReadAll(r) + if err != nil { + return "", fmt.Errorf("read data: %w", err) + } + + all, err = m.preprocessResource(all, mime) + if err != nil { + return "", fmt.Errorf("preprocess resource: %w", err) + } + + return base64.StdEncoding.EncodeToString(all), nil +} + +func (m *MediaInline) preprocessResource(data []byte, mime *string) ([]byte, error) { + detectedMime := mimetype.Detect(data) + + switch { + case strings.HasPrefix(detectedMime.String(), "image"): + decodedImage, err := imaging.Decode(bytes.NewBuffer(data)) + if err != nil { + m.log.Error("failed to decode image", zap.Error(err)) + + return data, nil + } + + if size := decodedImage.Bounds().Size(); size.X > 1024 || size.Y > 1024 { + thumbnail := imaging.Thumbnail(decodedImage, 1024, 1024, imaging.Lanczos) + buf := bytes.NewBuffer(nil) + + if err := imaging.Encode(buf, thumbnail, imaging.JPEG, imaging.JPEGQuality(90)); err != nil { + m.log.Error("failed to create resized image", zap.Error(err)) + + return data, nil + } + + *mime = "image/jpeg" + m.log.Info("Resized") + + return buf.Bytes(), nil + } + } + + return data, nil +} diff --git a/adapters/processors/pdf.go b/adapters/processors/pdf.go index f835490..cfca378 100644 --- a/adapters/processors/pdf.go +++ b/adapters/processors/pdf.go @@ -40,9 +40,9 @@ func (p *PDF) Process(_ context.Context, url string, cache *entity.Cache) ([]ent opts := wkhtmltopdf.NewPageOptions() opts.PrintMediaType.Set(p.cfg.MediaPrint) opts.JavascriptDelay.Set(200) - opts.DisableJavascript.Set(true) + opts.DisableJavascript.Set(false) opts.LoadErrorHandling.Set("ignore") - opts.LoadMediaErrorHandling.Set("ignore") + opts.LoadMediaErrorHandling.Set("skip") opts.FooterRight.Set("[opts]") opts.HeaderLeft.Set(url) opts.HeaderRight.Set(time.Now().Format(time.DateOnly)) @@ -50,9 +50,9 @@ func (p *PDF) Process(_ context.Context, url string, cache *entity.Cache) ([]ent opts.Zoom.Set(p.cfg.Zoom) opts.ViewportSize.Set(p.cfg.Viewport) opts.NoBackground.Set(true) - opts.DisableLocalFileAccess.Set(true) - opts.DisableExternalLinks.Set(true) - opts.DisableInternalLinks.Set(true) + opts.DisableLocalFileAccess.Set(false) + opts.DisableExternalLinks.Set(false) + opts.DisableInternalLinks.Set(false) var page wkhtmltopdf.PageProvider if len(cache.Get()) > 0 { diff --git a/adapters/processors/processors.go b/adapters/processors/processors.go index bddabac..ed1e096 100644 --- a/adapters/processors/processors.go +++ b/adapters/processors/processors.go @@ -10,6 +10,7 @@ import ( "strings" "time" + "go.uber.org/zap" "golang.org/x/net/html" "github.com/derfenix/webarchive/config" @@ -22,7 +23,7 @@ type processor interface { Process(ctx context.Context, url string, cache *entity.Cache) ([]entity.File, error) } -func NewProcessors(cfg config.Config) (*Processors, error) { +func NewProcessors(cfg config.Config, log *zap.Logger) (*Processors, error) { jar, err := cookiejar.New(&cookiejar.Options{ PublicSuffixList: nil, }) @@ -62,7 +63,7 @@ func NewProcessors(cfg config.Config) (*Processors, error) { processors: map[entity.Format]processor{ entity.FormatHeaders: NewHeaders(httpClient), entity.FormatPDF: NewPDF(cfg.PDF), - entity.FormatSingleFile: NewSingleFile(httpClient), + entity.FormatSingleFile: NewSingleFile(httpClient, log), }, } diff --git a/adapters/processors/processors_test.go b/adapters/processors/processors_test.go index 82157f7..53be71e 100644 --- a/adapters/processors/processors_test.go +++ b/adapters/processors/processors_test.go @@ -6,6 +6,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/zap/zaptest" "github.com/derfenix/webarchive/config" "github.com/derfenix/webarchive/entity" @@ -18,7 +19,7 @@ func TestProcessors_GetMeta(t *testing.T) { cfg, err := config.NewConfig(ctx) require.NoError(t, err) - procs, err := NewProcessors(cfg) + procs, err := NewProcessors(cfg, zaptest.NewLogger(t)) require.NoError(t, err) cache := entity.NewCache() diff --git a/adapters/processors/singlefile.go b/adapters/processors/singlefile.go index 681e4ca..419ae75 100644 --- a/adapters/processors/singlefile.go +++ b/adapters/processors/singlefile.go @@ -5,50 +5,46 @@ import ( "context" "fmt" "net/http" - "net/url" + "go.uber.org/zap" "golang.org/x/net/html" + "github.com/derfenix/webarchive/adapters/processors/internal" "github.com/derfenix/webarchive/entity" ) -func NewSingleFile(client *http.Client) *SingleFile { - return &SingleFile{client: client} +func NewSingleFile(client *http.Client, log *zap.Logger) *SingleFile { + return &SingleFile{client: client, log: log} } type SingleFile struct { client *http.Client + log *zap.Logger } -func (s *SingleFile) Process(ctx context.Context, url string, cache *entity.Cache) ([]entity.File, error) { +func (s *SingleFile) Process(ctx context.Context, pageURL string, cache *entity.Cache) ([]entity.File, error) { reader := cache.Reader() if reader == nil { - response, err := s.get(ctx, url) + response, err := s.get(ctx, pageURL) if err != nil { return nil, err } - if response.Body != nil { - defer func() { - _ = response.Body.Close() - }() - } + defer func() { + _ = response.Body.Close() + }() reader = response.Body } - htmlNode, err := html.Parse(reader) + inlinedHTML, err := internal.NewMediaInline(s.log, s.get).Inline(ctx, reader, pageURL) if err != nil { - return nil, fmt.Errorf("parse response body: %w", err) - } - - if err := s.process(ctx, htmlNode, url); err != nil { - return nil, fmt.Errorf("process: %w", err) + return nil, fmt.Errorf("inline media: %w", err) } buf := bytes.NewBuffer(nil) - if err := html.Render(buf, htmlNode); err != nil { + if err := html.Render(buf, inlinedHTML); err != nil { return nil, fmt.Errorf("render result html: %w", err) } @@ -78,59 +74,3 @@ func (s *SingleFile) get(ctx context.Context, url string) (*http.Response, error return response, nil } - -func (s *SingleFile) process(ctx context.Context, node *html.Node, pageURL string) error { - parsedURL, err := url.Parse(pageURL) - if err != nil { - return fmt.Errorf("parse page url: %w", err) - } - - baseURL := fmt.Sprintf("%s://%s", parsedURL.Scheme, parsedURL.Host) - - for child := node.FirstChild; child != nil; child = child.NextSibling { - var err error - switch child.Data { - case "head": - err = s.processHead(ctx, child, baseURL) - - case "body": - err = s.processBody(ctx, child, baseURL) - } - - if err != nil { - return err - } - } - - return nil -} - -func (s *SingleFile) processHead(ctx context.Context, node *html.Node, baseURL string) error { - for child := node.FirstChild; child != nil; child = child.NextSibling { - switch child.Data { - case "link": - if err := s.processHref(ctx, child.Attr, baseURL); err != nil { - return fmt.Errorf("process link %s: %w", child.Attr, err) - } - - case "script": - if err := s.processSrc(ctx, child.Attr, baseURL); err != nil { - return fmt.Errorf("process script %s: %w", child.Attr, err) - } - } - } - - return nil -} - -func (s *SingleFile) processBody(ctx context.Context, child *html.Node, url string) error { - return nil -} - -func (s *SingleFile) processHref(ctx context.Context, attrs []html.Attribute, baseURL string) error { - return nil -} - -func (s *SingleFile) processSrc(ctx context.Context, attrs []html.Attribute, baseURL string) error { - return nil -} diff --git a/adapters/repository/badger/page.go b/adapters/repository/badger/page.go index a6dcd7f..845d652 100644 --- a/adapters/repository/badger/page.go +++ b/adapters/repository/badger/page.go @@ -192,7 +192,7 @@ func (p *Page) ListUnprocessed(ctx context.Context) ([]entity.Page, error) { return fmt.Errorf("get item: %w", err) } - if page.Status == entity.StatusNew { + if page.Status == entity.StatusNew || page.Status == entity.StatusProcessing { //goland:noinspection GoVetCopyLock pages = append(pages, page) //nolint:govet // didn't touch the lock here } diff --git a/application/application.go b/application/application.go index 70b4cce..3bbbeb8 100644 --- a/application/application.go +++ b/application/application.go @@ -10,13 +10,14 @@ import ( "sync" "time" - "github.com/derfenix/webarchive/adapters/repository" "github.com/dgraph-io/badger/v4" "github.com/ogen-go/ogen/middleware" "go.uber.org/multierr" "go.uber.org/zap" "go.uber.org/zap/zapcore" + "github.com/derfenix/webarchive/adapters/repository" + "github.com/derfenix/webarchive/adapters/processors" badgerRepo "github.com/derfenix/webarchive/adapters/repository/badger" "github.com/derfenix/webarchive/api/openapi" @@ -41,7 +42,7 @@ func NewApplication(cfg config.Config) (Application, error) { return Application{}, fmt.Errorf("new page repo: %w", err) } - processor, err := processors.NewProcessors(cfg) + processor, err := processors.NewProcessors(cfg, log.Named("processor")) if err != nil { return Application{}, fmt.Errorf("new processors: %w", err) } @@ -50,7 +51,7 @@ func NewApplication(cfg config.Config) (Application, error) { worker := entity.NewWorker(workerCh, pageRepo, processor, log.Named("worker")) server, err := openapi.NewServer( - rest.NewService(pageRepo, workerCh), + rest.NewService(pageRepo, workerCh, processor), openapi.WithPathPrefix("/api/v1"), openapi.WithMiddleware( func(r middleware.Request, next middleware.Next) (middleware.Response, error) { @@ -190,6 +191,7 @@ func newLogger(cfg config.Logging) (*zap.Logger, error) { logCfg.EncoderConfig.EncodeTime = zapcore.RFC3339TimeEncoder logCfg.EncoderConfig.EncodeDuration = zapcore.NanosDurationEncoder logCfg.DisableCaller = true + logCfg.DisableStacktrace = true logCfg.Level = zap.NewAtomicLevelAt(zapcore.InfoLevel) if cfg.Debug { diff --git a/entity/page.go b/entity/page.go index cefedd3..ddeaf56 100644 --- a/entity/page.go +++ b/entity/page.go @@ -3,6 +3,7 @@ package entity import ( "context" "fmt" + "runtime/debug" "sync" "time" @@ -80,16 +81,18 @@ func (p *Page) SetProcessing() { p.Status = StatusProcessing } -func (p *Page) Process(ctx context.Context, processor Processor) { - innerWG := sync.WaitGroup{} - innerWG.Add(len(p.Formats)) - +func (p *Page) Prepare(ctx context.Context, processor Processor) { meta, err := processor.GetMeta(ctx, p.URL, p.cache) if err != nil { p.Meta.Error = err.Error() } else { p.Meta = meta } +} + +func (p *Page) Process(ctx context.Context, processor Processor) { + innerWG := sync.WaitGroup{} + innerWG.Add(len(p.Formats)) results := Results{} @@ -99,7 +102,7 @@ func (p *Page) Process(ctx context.Context, processor Processor) { defer func() { if err := recover(); err != nil { - results.Add(Result{Format: format, Err: fmt.Errorf("recovered from panic: %v", err)}) + results.Add(Result{Format: format, Err: fmt.Errorf("recovered from panic: %v (%s)", err, string(debug.Stack()))}) } }() diff --git a/entity/worker.go b/entity/worker.go index bf8b333..6719e94 100644 --- a/entity/worker.go +++ b/entity/worker.go @@ -66,6 +66,16 @@ func (w *Worker) Start(ctx context.Context, wg *sync.WaitGroup) { func (w *Worker) do(ctx context.Context, wg *sync.WaitGroup, page *Page, log *zap.Logger) { defer wg.Done() + page.SetProcessing() + if err := w.pages.Save(ctx, page); err != nil { + w.log.Error( + "failed to save processing page", + zap.String("page_id", page.ID.String()), + zap.String("page_url", page.URL), + zap.Error(err), + ) + } + page.Process(ctx, w.processor) log.Debug("page processed") diff --git a/go.mod b/go.mod index 2e2c828..c08b95a 100644 --- a/go.mod +++ b/go.mod @@ -27,6 +27,7 @@ require ( github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/dgraph-io/ristretto v0.1.1 // indirect + github.com/disintegration/imaging v1.6.2 // indirect github.com/dlclark/regexp2 v1.10.0 // indirect github.com/dustin/go-humanize v1.0.1 // indirect github.com/fatih/color v1.15.0 // indirect @@ -58,6 +59,7 @@ require ( go.opencensus.io v0.24.0 // indirect golang.org/x/crypto v0.14.0 // indirect golang.org/x/exp v0.0.0-20230725093048-515e97ebf090 // indirect + golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8 // indirect golang.org/x/sync v0.4.0 // indirect golang.org/x/sys v0.13.0 // indirect golang.org/x/text v0.13.0 // indirect diff --git a/go.sum b/go.sum index 9819d1b..925b13a 100644 --- a/go.sum +++ b/go.sum @@ -21,6 +21,8 @@ github.com/dgraph-io/ristretto v0.1.1 h1:6CWw5tJNgpegArSHpNHJKldNeq03FQCwYvfMVWa github.com/dgraph-io/ristretto v0.1.1/go.mod h1:S1GPSBCYCIhmVNfcth17y2zZtQT6wzkzgwUve0VDWWA= github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2 h1:tdlZCpZ/P9DhczCTSixgIKmwPv6+wP5DGjqLYw5SUiA= github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= +github.com/disintegration/imaging v1.6.2 h1:w1LecBlG2Lnp8B3jk5zSuNqd7b4DXhcjwek1ei82L+c= +github.com/disintegration/imaging v1.6.2/go.mod h1:44/5580QXChDfwIclfc/PCwrr44amcmDAg8hxG0Ewe4= github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq0= github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= @@ -169,6 +171,8 @@ golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20230725093048-515e97ebf090 h1:Di6/M8l0O2lCLc6VVRWhgCiApHV8MnQurBnFSHsQtNY= golang.org/x/exp v0.0.0-20230725093048-515e97ebf090/go.mod h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc= +golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8 h1:hVwzHzIUGRjiF7EcUjqNxk3NCfkPxbDKRdnNE1Rpg0U= +golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= diff --git a/ports/rest/service.go b/ports/rest/service.go index c8ed763..5068e50 100644 --- a/ports/rest/service.go +++ b/ports/rest/service.go @@ -20,17 +20,19 @@ type Pages interface { GetFile(ctx context.Context, pageID, fileID uuid.UUID) (*entity.File, error) } -func NewService(pages Pages, ch chan *entity.Page) *Service { +func NewService(pages Pages, ch chan *entity.Page, processor entity.Processor) *Service { return &Service{ - pages: pages, - ch: ch, + pages: pages, + ch: ch, + processor: processor, } } type Service struct { openapi.UnimplementedHandler - pages Pages - ch chan *entity.Page + processor entity.Processor + pages Pages + ch chan *entity.Page } func (s *Service) GetPage(ctx context.Context, params openapi.GetPageParams) (openapi.GetPageRes, error) { @@ -79,7 +81,8 @@ func (s *Service) AddPage(ctx context.Context, req openapi.OptAddPageReq, params } page := entity.NewPage(url, description, domainFormats...) - page.Status = entity.StatusProcessing + page.Status = entity.StatusNew + page.Prepare(ctx, s.processor) if err := s.pages.Save(ctx, page); err != nil { return nil, fmt.Errorf("save page: %w", err)