From 9912b7e436cae0d898dd3d26bda3b5fec0980983 Mon Sep 17 00:00:00 2001 From: derfenix Date: Thu, 16 Nov 2023 23:46:01 +0300 Subject: [PATCH] Fix reduce network calls count for the target url --- adapters/processors/headers.go | 2 +- adapters/processors/pdf.go | 37 +++++++++++++++-------- adapters/processors/processors.go | 13 +++++--- adapters/processors/processors_test.go | 5 ++- adapters/processors/singlefile.go | 32 +++++++++++++------- entity/cache.go | 42 ++++++++++++++++++++++++++ entity/page.go | 10 +++--- 7 files changed, 106 insertions(+), 35 deletions(-) create mode 100644 entity/cache.go diff --git a/adapters/processors/headers.go b/adapters/processors/headers.go index cf1c366..a25e1a9 100644 --- a/adapters/processors/headers.go +++ b/adapters/processors/headers.go @@ -17,7 +17,7 @@ type Headers struct { client *http.Client } -func (h *Headers) Process(ctx context.Context, url string) ([]entity.File, error) { +func (h *Headers) Process(ctx context.Context, url string, _ *entity.Cache) ([]entity.File, error) { var ( headersFile entity.File err error diff --git a/adapters/processors/pdf.go b/adapters/processors/pdf.go index 337695e..f835490 100644 --- a/adapters/processors/pdf.go +++ b/adapters/processors/pdf.go @@ -19,7 +19,7 @@ type PDF struct { cfg config.PDF } -func (p *PDF) Process(_ context.Context, url string) ([]entity.File, error) { +func (p *PDF) Process(_ context.Context, url string, cache *entity.Cache) ([]entity.File, error) { gen, err := wkhtmltopdf.NewPDFGenerator() if err != nil { return nil, fmt.Errorf("new pdf generator: %w", err) @@ -37,18 +37,29 @@ func (p *PDF) Process(_ context.Context, url string) ([]entity.File, error) { gen.Grayscale.Set(p.cfg.Grayscale) gen.Title.Set(url) - page := wkhtmltopdf.NewPage(url) - page.PrintMediaType.Set(p.cfg.MediaPrint) - page.JavascriptDelay.Set(200) - page.LoadErrorHandling.Set("ignore") - page.LoadMediaErrorHandling.Set("ignore") - page.FooterRight.Set("[page]") - page.HeaderLeft.Set(url) - 
page.HeaderRight.Set(time.Now().Format(time.DateOnly)) - page.FooterFontSize.Set(10) - page.Zoom.Set(p.cfg.Zoom) - page.ViewportSize.Set(p.cfg.Viewport) - page.NoBackground.Set(true) + opts := wkhtmltopdf.NewPageOptions() + opts.PrintMediaType.Set(p.cfg.MediaPrint) + opts.JavascriptDelay.Set(200) + opts.DisableJavascript.Set(true) + opts.LoadErrorHandling.Set("ignore") + opts.LoadMediaErrorHandling.Set("ignore") + opts.FooterRight.Set("[page]") + opts.HeaderLeft.Set(url) + opts.HeaderRight.Set(time.Now().Format(time.DateOnly)) + opts.FooterFontSize.Set(10) + opts.Zoom.Set(p.cfg.Zoom) + opts.ViewportSize.Set(p.cfg.Viewport) + opts.NoBackground.Set(true) + opts.DisableLocalFileAccess.Set(true) + opts.DisableExternalLinks.Set(true) + opts.DisableInternalLinks.Set(true) + + var page wkhtmltopdf.PageProvider + if len(cache.Get()) > 0 { + page = &wkhtmltopdf.PageReader{Input: cache.Reader(), PageOptions: opts} + } else { + page = &wkhtmltopdf.Page{Input: url, PageOptions: opts} + } gen.AddPage(page) diff --git a/adapters/processors/processors.go b/adapters/processors/processors.go index f17aff8..bddabac 100644 --- a/adapters/processors/processors.go +++ b/adapters/processors/processors.go @@ -3,6 +3,7 @@ package processors import ( "context" "fmt" + "io" "net" "net/http" "net/http/cookiejar" @@ -18,7 +19,7 @@ import ( const defaultEncoding = "utf-8" type processor interface { - Process(ctx context.Context, url string) ([]entity.File, error) + Process(ctx context.Context, url string, cache *entity.Cache) ([]entity.File, error) } func NewProcessors(cfg config.Config) (*Processors, error) { @@ -73,7 +74,7 @@ type Processors struct { client *http.Client } -func (p *Processors) Process(ctx context.Context, format entity.Format, url string) entity.Result { +func (p *Processors) Process(ctx context.Context, format entity.Format, url string, cache *entity.Cache) entity.Result { result := entity.Result{Format: format} proc, ok := p.processors[format] @@ -83,7 +84,7 @@ func (p
*Processors) Process(ctx context.Context, format entity.Format, url stri return result } - files, err := proc.Process(ctx, url) + files, err := proc.Process(ctx, url, cache) if err != nil { result.Err = fmt.Errorf("process: %w", err) @@ -101,7 +102,7 @@ func (p *Processors) OverrideProcessor(format entity.Format, proc processor) err return nil } -func (p *Processors) GetMeta(ctx context.Context, url string) (entity.Meta, error) { +func (p *Processors) GetMeta(ctx context.Context, url string, cache *entity.Cache) (entity.Meta, error) { req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { return entity.Meta{}, fmt.Errorf("new request: %w", err) @@ -124,7 +125,9 @@ func (p *Processors) GetMeta(ctx context.Context, url string) (entity.Meta, erro _ = response.Body.Close() }() - htmlNode, err := html.Parse(response.Body) + tee := io.TeeReader(response.Body, cache) + + htmlNode, err := html.Parse(tee) if err != nil { return entity.Meta{}, fmt.Errorf("parse response body: %w", err) } diff --git a/adapters/processors/processors_test.go b/adapters/processors/processors_test.go index d226477..82157f7 100644 --- a/adapters/processors/processors_test.go +++ b/adapters/processors/processors_test.go @@ -8,6 +8,7 @@ import ( "github.com/stretchr/testify/require" "github.com/derfenix/webarchive/config" + "github.com/derfenix/webarchive/entity" ) func TestProcessors_GetMeta(t *testing.T) { @@ -20,7 +21,9 @@ func TestProcessors_GetMeta(t *testing.T) { procs, err := NewProcessors(cfg) require.NoError(t, err) - meta, err := procs.GetMeta(ctx, "https://habr.com/ru/companies/wirenboard/articles/722718/") + cache := entity.NewCache() + + meta, err := procs.GetMeta(ctx, "https://habr.com/ru/companies/wirenboard/articles/722718/", cache) require.NoError(t, err) assert.Equal(t, "Сколько стоит умный дом? 
Рассказываю, как строил свой и что получилось за 1000 руб./м² / Хабр", meta.Title) } diff --git a/adapters/processors/singlefile.go b/adapters/processors/singlefile.go index 5b22cd0..681e4ca 100644 --- a/adapters/processors/singlefile.go +++ b/adapters/processors/singlefile.go @@ -7,8 +7,9 @@ import ( "net/http" "net/url" - "github.com/derfenix/webarchive/entity" "golang.org/x/net/html" + + "github.com/derfenix/webarchive/entity" ) func NewSingleFile(client *http.Client) *SingleFile { @@ -19,21 +20,30 @@ type SingleFile struct { client *http.Client } -func (s *SingleFile) Process(ctx context.Context, url string) ([]entity.File, error) { - response, err := s.get(ctx, url) - if err != nil { - return nil, err +func (s *SingleFile) Process(ctx context.Context, url string, cache *entity.Cache) ([]entity.File, error) { + reader := cache.Reader() + + if reader == nil { + response, err := s.get(ctx, url) + if err != nil { + return nil, err + } + + if response.Body != nil { + defer func() { + _ = response.Body.Close() + }() + } + + reader = response.Body } - htmlNode, err := html.Parse(response.Body) + htmlNode, err := html.Parse(reader) if err != nil { - _ = response.Body.Close() return nil, fmt.Errorf("parse response body: %w", err) } - _ = response.Body.Close() - - if err := s.process(ctx, htmlNode, url, response.Header); err != nil { + if err := s.process(ctx, htmlNode, url); err != nil { return nil, fmt.Errorf("process: %w", err) } @@ -69,7 +79,7 @@ func (s *SingleFile) get(ctx context.Context, url string) (*http.Response, error return response, nil } -func (s *SingleFile) process(ctx context.Context, node *html.Node, pageURL string, headers http.Header) error { +func (s *SingleFile) process(ctx context.Context, node *html.Node, pageURL string) error { parsedURL, err := url.Parse(pageURL) if err != nil { return fmt.Errorf("parse page url: %w", err) diff --git a/entity/cache.go b/entity/cache.go new file mode 100644 index 0000000..fa41c8d --- /dev/null +++ 
b/entity/cache.go @@ -0,0 +1,42 @@ +package entity + +import ( + "bytes" + "io" + "sync" +) + +func NewCache() *Cache { + return &Cache{data: make([]byte, 0, 1024*512)} +} + +type Cache struct { + mu sync.RWMutex + data []byte +} + +func (c *Cache) Write(p []byte) (n int, err error) { + c.mu.Lock() + c.data = append(c.data, p...) + c.mu.Unlock() + + return len(p), nil +} + +func (c *Cache) Get() []byte { + c.mu.RLock() + defer c.mu.RUnlock() + + return c.data +} + +func (c *Cache) Reader() io.Reader { + c.mu.RLock() + defer c.mu.RUnlock() + + if len(c.data) == 0 { + return nil + } + + return bytes.NewBuffer(c.data) +} diff --git a/entity/page.go b/entity/page.go index 6ef14ff..cefedd3 100644 --- a/entity/page.go +++ b/entity/page.go @@ -10,8 +10,8 @@ import ( ) type Processor interface { - Process(ctx context.Context, format Format, url string) Result - GetMeta(ctx context.Context, url string) (Meta, error) + Process(ctx context.Context, format Format, url string, cache *Cache) Result + GetMeta(ctx context.Context, url string, cache *Cache) (Meta, error) } type Format uint8 @@ -66,12 +66,14 @@ func NewPage(url string, description string, formats ...Format) *Page { Created: time.Now(), Version: 1, }, + cache: NewCache(), } } type Page struct { PageBase Results ResultsRO + cache *Cache } func (p *Page) SetProcessing() { @@ -82,7 +84,7 @@ func (p *Page) Process(ctx context.Context, processor Processor) { innerWG := sync.WaitGroup{} innerWG.Add(len(p.Formats)) - meta, err := processor.GetMeta(ctx, p.URL) + meta, err := processor.GetMeta(ctx, p.URL, p.cache) if err != nil { p.Meta.Error = err.Error() } else { @@ -101,7 +103,7 @@ func (p *Page) Process(ctx context.Context, processor Processor) { } }() - result := processor.Process(ctx, format, p.URL) + result := processor.Process(ctx, format, p.URL, p.cache) results.Add(result) }(format) }