Fix: reduce the number of network calls for the target URL

This commit is contained in:
2023-11-16 23:46:01 +03:00
parent e27fdabf78
commit 9912b7e436
7 changed files with 106 additions and 35 deletions

View File

@@ -17,7 +17,7 @@ type Headers struct {
client *http.Client
}
func (h *Headers) Process(ctx context.Context, url string) ([]entity.File, error) {
func (h *Headers) Process(ctx context.Context, url string, _ *entity.Cache) ([]entity.File, error) {
var (
headersFile entity.File
err error

View File

@@ -19,7 +19,7 @@ type PDF struct {
cfg config.PDF
}
func (p *PDF) Process(_ context.Context, url string) ([]entity.File, error) {
func (p *PDF) Process(_ context.Context, url string, cache *entity.Cache) ([]entity.File, error) {
gen, err := wkhtmltopdf.NewPDFGenerator()
if err != nil {
return nil, fmt.Errorf("new pdf generator: %w", err)
@@ -37,18 +37,29 @@ func (p *PDF) Process(_ context.Context, url string) ([]entity.File, error) {
gen.Grayscale.Set(p.cfg.Grayscale)
gen.Title.Set(url)
page := wkhtmltopdf.NewPage(url)
page.PrintMediaType.Set(p.cfg.MediaPrint)
page.JavascriptDelay.Set(200)
page.LoadErrorHandling.Set("ignore")
page.LoadMediaErrorHandling.Set("ignore")
page.FooterRight.Set("[page]")
page.HeaderLeft.Set(url)
page.HeaderRight.Set(time.Now().Format(time.DateOnly))
page.FooterFontSize.Set(10)
page.Zoom.Set(p.cfg.Zoom)
page.ViewportSize.Set(p.cfg.Viewport)
page.NoBackground.Set(true)
opts := wkhtmltopdf.NewPageOptions()
opts.PrintMediaType.Set(p.cfg.MediaPrint)
opts.JavascriptDelay.Set(200)
opts.DisableJavascript.Set(true)
opts.LoadErrorHandling.Set("ignore")
opts.LoadMediaErrorHandling.Set("ignore")
opts.FooterRight.Set("[page]")
opts.HeaderLeft.Set(url)
opts.HeaderRight.Set(time.Now().Format(time.DateOnly))
opts.FooterFontSize.Set(10)
opts.Zoom.Set(p.cfg.Zoom)
opts.ViewportSize.Set(p.cfg.Viewport)
opts.NoBackground.Set(true)
opts.DisableLocalFileAccess.Set(true)
opts.DisableExternalLinks.Set(true)
opts.DisableInternalLinks.Set(true)
var page wkhtmltopdf.PageProvider
if len(cache.Get()) > 0 {
page = &wkhtmltopdf.PageReader{Input: cache.Reader(), PageOptions: opts}
} else {
page = &wkhtmltopdf.Page{Input: url, PageOptions: opts}
}
gen.AddPage(page)

View File

@@ -3,6 +3,7 @@ package processors
import (
"context"
"fmt"
"io"
"net"
"net/http"
"net/http/cookiejar"
@@ -18,7 +19,7 @@ import (
const defaultEncoding = "utf-8"
type processor interface {
Process(ctx context.Context, url string) ([]entity.File, error)
Process(ctx context.Context, url string, cache *entity.Cache) ([]entity.File, error)
}
func NewProcessors(cfg config.Config) (*Processors, error) {
@@ -73,7 +74,7 @@ type Processors struct {
client *http.Client
}
func (p *Processors) Process(ctx context.Context, format entity.Format, url string) entity.Result {
func (p *Processors) Process(ctx context.Context, format entity.Format, url string, cache *entity.Cache) entity.Result {
result := entity.Result{Format: format}
proc, ok := p.processors[format]
@@ -83,7 +84,7 @@ func (p *Processors) Process(ctx context.Context, format entity.Format, url stri
return result
}
files, err := proc.Process(ctx, url)
files, err := proc.Process(ctx, url, cache)
if err != nil {
result.Err = fmt.Errorf("process: %w", err)
@@ -101,7 +102,7 @@ func (p *Processors) OverrideProcessor(format entity.Format, proc processor) err
return nil
}
func (p *Processors) GetMeta(ctx context.Context, url string) (entity.Meta, error) {
func (p *Processors) GetMeta(ctx context.Context, url string, cache *entity.Cache) (entity.Meta, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return entity.Meta{}, fmt.Errorf("new request: %w", err)
@@ -124,7 +125,9 @@ func (p *Processors) GetMeta(ctx context.Context, url string) (entity.Meta, erro
_ = response.Body.Close()
}()
htmlNode, err := html.Parse(response.Body)
tee := io.TeeReader(response.Body, cache)
htmlNode, err := html.Parse(tee)
if err != nil {
return entity.Meta{}, fmt.Errorf("parse response body: %w", err)
}

View File

@@ -8,6 +8,7 @@ import (
"github.com/stretchr/testify/require"
"github.com/derfenix/webarchive/config"
"github.com/derfenix/webarchive/entity"
)
func TestProcessors_GetMeta(t *testing.T) {
@@ -20,7 +21,9 @@ func TestProcessors_GetMeta(t *testing.T) {
procs, err := NewProcessors(cfg)
require.NoError(t, err)
meta, err := procs.GetMeta(ctx, "https://habr.com/ru/companies/wirenboard/articles/722718/")
cache := entity.NewCache()
meta, err := procs.GetMeta(ctx, "https://habr.com/ru/companies/wirenboard/articles/722718/", cache)
require.NoError(t, err)
assert.Equal(t, "Сколько стоит умный дом? Рассказываю, как строил свой и что получилось за 1000 руб./м² / Хабр", meta.Title)
}

View File

@@ -7,8 +7,9 @@ import (
"net/http"
"net/url"
"github.com/derfenix/webarchive/entity"
"golang.org/x/net/html"
"github.com/derfenix/webarchive/entity"
)
func NewSingleFile(client *http.Client) *SingleFile {
@@ -19,21 +20,30 @@ type SingleFile struct {
client *http.Client
}
func (s *SingleFile) Process(ctx context.Context, url string) ([]entity.File, error) {
func (s *SingleFile) Process(ctx context.Context, url string, cache *entity.Cache) ([]entity.File, error) {
reader := cache.Reader()
if reader == nil {
response, err := s.get(ctx, url)
if err != nil {
return nil, err
}
htmlNode, err := html.Parse(response.Body)
if err != nil {
if response.Body != nil {
defer func() {
_ = response.Body.Close()
}()
}
reader = response.Body
}
htmlNode, err := html.Parse(reader)
if err != nil {
return nil, fmt.Errorf("parse response body: %w", err)
}
_ = response.Body.Close()
if err := s.process(ctx, htmlNode, url, response.Header); err != nil {
if err := s.process(ctx, htmlNode, url); err != nil {
return nil, fmt.Errorf("process: %w", err)
}
@@ -69,7 +79,7 @@ func (s *SingleFile) get(ctx context.Context, url string) (*http.Response, error
return response, nil
}
func (s *SingleFile) process(ctx context.Context, node *html.Node, pageURL string, headers http.Header) error {
func (s *SingleFile) process(ctx context.Context, node *html.Node, pageURL string) error {
parsedURL, err := url.Parse(pageURL)
if err != nil {
return fmt.Errorf("parse page url: %w", err)

42
entity/cache.go Normal file
View File

@@ -0,0 +1,42 @@
package entity
import (
"bytes"
"io"
"sync"
)
// NewCache returns an empty Cache with capacity pre-allocated for a
// typical HTML page (512 KiB) to reduce reallocations while writing.
func NewCache() *Cache {
	return &Cache{data: make([]byte, 0, 1024*512)}
}

// Cache is a concurrency-safe, append-only byte buffer used to share one
// downloaded response body between several processors. It implements
// io.Writer so it can be the target of an io.TeeReader.
type Cache struct {
	mu   sync.RWMutex
	data []byte
}

// Write appends p to the cached data. It never fails and always reports
// len(p) bytes written, satisfying io.Writer.
func (c *Cache) Write(p []byte) (n int, err error) {
	c.mu.Lock()
	c.data = append(c.data, p...)
	c.mu.Unlock()

	return len(p), nil
}

// Len reports the number of cached bytes. Prefer it over len(Get()) for
// emptiness checks: it does not copy the data.
func (c *Cache) Len() int {
	c.mu.RLock()
	defer c.mu.RUnlock()

	return len(c.data)
}

// Get returns a copy of the cached data. A defensive copy is returned so
// callers cannot mutate the cache's internal buffer after the lock has
// been released (returning c.data directly would alias guarded state).
func (c *Cache) Get() []byte {
	c.mu.RLock()
	defer c.mu.RUnlock()

	out := make([]byte, len(c.data))
	copy(out, c.data)

	return out
}

// Reader returns a reader over a snapshot of the cached data, or nil if
// nothing has been cached yet. bytes.NewReader is used instead of
// bytes.NewBuffer because the data is only read through it, never
// written; concurrent appends never touch the snapshot's bytes.
func (c *Cache) Reader() io.Reader {
	c.mu.RLock()
	defer c.mu.RUnlock()

	if len(c.data) == 0 {
		return nil
	}

	return bytes.NewReader(c.data)
}

View File

@@ -10,8 +10,8 @@ import (
)
type Processor interface {
Process(ctx context.Context, format Format, url string) Result
GetMeta(ctx context.Context, url string) (Meta, error)
Process(ctx context.Context, format Format, url string, cache *Cache) Result
GetMeta(ctx context.Context, url string, cache *Cache) (Meta, error)
}
type Format uint8
@@ -66,12 +66,14 @@ func NewPage(url string, description string, formats ...Format) *Page {
Created: time.Now(),
Version: 1,
},
cache: NewCache(),
}
}
type Page struct {
PageBase
Results ResultsRO
cache *Cache
}
func (p *Page) SetProcessing() {
@@ -82,7 +84,7 @@ func (p *Page) Process(ctx context.Context, processor Processor) {
innerWG := sync.WaitGroup{}
innerWG.Add(len(p.Formats))
meta, err := processor.GetMeta(ctx, p.URL)
meta, err := processor.GetMeta(ctx, p.URL, p.cache)
if err != nil {
p.Meta.Error = err.Error()
} else {
@@ -101,7 +103,7 @@ func (p *Page) Process(ctx context.Context, processor Processor) {
}
}()
result := processor.Process(ctx, format, p.URL)
result := processor.Process(ctx, format, p.URL, p.cache)
results.Add(result)
}(format)
}