Fix: reduce the number of network calls made to the target URL

This commit is contained in:
2023-11-16 23:46:01 +03:00
parent e27fdabf78
commit 9912b7e436
7 changed files with 106 additions and 35 deletions

View File

@@ -17,7 +17,7 @@ type Headers struct {
client *http.Client client *http.Client
} }
func (h *Headers) Process(ctx context.Context, url string) ([]entity.File, error) { func (h *Headers) Process(ctx context.Context, url string, _ *entity.Cache) ([]entity.File, error) {
var ( var (
headersFile entity.File headersFile entity.File
err error err error

View File

@@ -19,7 +19,7 @@ type PDF struct {
cfg config.PDF cfg config.PDF
} }
func (p *PDF) Process(_ context.Context, url string) ([]entity.File, error) { func (p *PDF) Process(_ context.Context, url string, cache *entity.Cache) ([]entity.File, error) {
gen, err := wkhtmltopdf.NewPDFGenerator() gen, err := wkhtmltopdf.NewPDFGenerator()
if err != nil { if err != nil {
return nil, fmt.Errorf("new pdf generator: %w", err) return nil, fmt.Errorf("new pdf generator: %w", err)
@@ -37,18 +37,29 @@ func (p *PDF) Process(_ context.Context, url string) ([]entity.File, error) {
gen.Grayscale.Set(p.cfg.Grayscale) gen.Grayscale.Set(p.cfg.Grayscale)
gen.Title.Set(url) gen.Title.Set(url)
page := wkhtmltopdf.NewPage(url) opts := wkhtmltopdf.NewPageOptions()
page.PrintMediaType.Set(p.cfg.MediaPrint) opts.PrintMediaType.Set(p.cfg.MediaPrint)
page.JavascriptDelay.Set(200) opts.JavascriptDelay.Set(200)
page.LoadErrorHandling.Set("ignore") opts.DisableJavascript.Set(true)
page.LoadMediaErrorHandling.Set("ignore") opts.LoadErrorHandling.Set("ignore")
page.FooterRight.Set("[page]") opts.LoadMediaErrorHandling.Set("ignore")
page.HeaderLeft.Set(url) opts.FooterRight.Set("[opts]")
page.HeaderRight.Set(time.Now().Format(time.DateOnly)) opts.HeaderLeft.Set(url)
page.FooterFontSize.Set(10) opts.HeaderRight.Set(time.Now().Format(time.DateOnly))
page.Zoom.Set(p.cfg.Zoom) opts.FooterFontSize.Set(10)
page.ViewportSize.Set(p.cfg.Viewport) opts.Zoom.Set(p.cfg.Zoom)
page.NoBackground.Set(true) opts.ViewportSize.Set(p.cfg.Viewport)
opts.NoBackground.Set(true)
opts.DisableLocalFileAccess.Set(true)
opts.DisableExternalLinks.Set(true)
opts.DisableInternalLinks.Set(true)
var page wkhtmltopdf.PageProvider
if len(cache.Get()) > 0 {
page = &wkhtmltopdf.PageReader{Input: cache.Reader(), PageOptions: opts}
} else {
page = &wkhtmltopdf.Page{Input: url, PageOptions: opts}
}
gen.AddPage(page) gen.AddPage(page)

View File

@@ -3,6 +3,7 @@ package processors
import ( import (
"context" "context"
"fmt" "fmt"
"io"
"net" "net"
"net/http" "net/http"
"net/http/cookiejar" "net/http/cookiejar"
@@ -18,7 +19,7 @@ import (
const defaultEncoding = "utf-8" const defaultEncoding = "utf-8"
type processor interface { type processor interface {
Process(ctx context.Context, url string) ([]entity.File, error) Process(ctx context.Context, url string, cache *entity.Cache) ([]entity.File, error)
} }
func NewProcessors(cfg config.Config) (*Processors, error) { func NewProcessors(cfg config.Config) (*Processors, error) {
@@ -73,7 +74,7 @@ type Processors struct {
client *http.Client client *http.Client
} }
func (p *Processors) Process(ctx context.Context, format entity.Format, url string) entity.Result { func (p *Processors) Process(ctx context.Context, format entity.Format, url string, cache *entity.Cache) entity.Result {
result := entity.Result{Format: format} result := entity.Result{Format: format}
proc, ok := p.processors[format] proc, ok := p.processors[format]
@@ -83,7 +84,7 @@ func (p *Processors) Process(ctx context.Context, format entity.Format, url stri
return result return result
} }
files, err := proc.Process(ctx, url) files, err := proc.Process(ctx, url, cache)
if err != nil { if err != nil {
result.Err = fmt.Errorf("process: %w", err) result.Err = fmt.Errorf("process: %w", err)
@@ -101,7 +102,7 @@ func (p *Processors) OverrideProcessor(format entity.Format, proc processor) err
return nil return nil
} }
func (p *Processors) GetMeta(ctx context.Context, url string) (entity.Meta, error) { func (p *Processors) GetMeta(ctx context.Context, url string, cache *entity.Cache) (entity.Meta, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil { if err != nil {
return entity.Meta{}, fmt.Errorf("new request: %w", err) return entity.Meta{}, fmt.Errorf("new request: %w", err)
@@ -124,7 +125,9 @@ func (p *Processors) GetMeta(ctx context.Context, url string) (entity.Meta, erro
_ = response.Body.Close() _ = response.Body.Close()
}() }()
htmlNode, err := html.Parse(response.Body) tee := io.TeeReader(response.Body, cache)
htmlNode, err := html.Parse(tee)
if err != nil { if err != nil {
return entity.Meta{}, fmt.Errorf("parse response body: %w", err) return entity.Meta{}, fmt.Errorf("parse response body: %w", err)
} }

View File

@@ -8,6 +8,7 @@ import (
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
"github.com/derfenix/webarchive/config" "github.com/derfenix/webarchive/config"
"github.com/derfenix/webarchive/entity"
) )
func TestProcessors_GetMeta(t *testing.T) { func TestProcessors_GetMeta(t *testing.T) {
@@ -20,7 +21,9 @@ func TestProcessors_GetMeta(t *testing.T) {
procs, err := NewProcessors(cfg) procs, err := NewProcessors(cfg)
require.NoError(t, err) require.NoError(t, err)
meta, err := procs.GetMeta(ctx, "https://habr.com/ru/companies/wirenboard/articles/722718/") cache := entity.NewCache()
meta, err := procs.GetMeta(ctx, "https://habr.com/ru/companies/wirenboard/articles/722718/", cache)
require.NoError(t, err) require.NoError(t, err)
assert.Equal(t, "Сколько стоит умный дом? Рассказываю, как строил свой и что получилось за 1000 руб./м² / Хабр", meta.Title) assert.Equal(t, "Сколько стоит умный дом? Рассказываю, как строил свой и что получилось за 1000 руб./м² / Хабр", meta.Title)
} }

View File

@@ -7,8 +7,9 @@ import (
"net/http" "net/http"
"net/url" "net/url"
"github.com/derfenix/webarchive/entity"
"golang.org/x/net/html" "golang.org/x/net/html"
"github.com/derfenix/webarchive/entity"
) )
func NewSingleFile(client *http.Client) *SingleFile { func NewSingleFile(client *http.Client) *SingleFile {
@@ -19,21 +20,30 @@ type SingleFile struct {
client *http.Client client *http.Client
} }
func (s *SingleFile) Process(ctx context.Context, url string) ([]entity.File, error) { func (s *SingleFile) Process(ctx context.Context, url string, cache *entity.Cache) ([]entity.File, error) {
response, err := s.get(ctx, url) reader := cache.Reader()
if err != nil {
return nil, err if reader == nil {
response, err := s.get(ctx, url)
if err != nil {
return nil, err
}
if response.Body != nil {
defer func() {
_ = response.Body.Close()
}()
}
reader = response.Body
} }
htmlNode, err := html.Parse(response.Body) htmlNode, err := html.Parse(reader)
if err != nil { if err != nil {
_ = response.Body.Close()
return nil, fmt.Errorf("parse response body: %w", err) return nil, fmt.Errorf("parse response body: %w", err)
} }
_ = response.Body.Close() if err := s.process(ctx, htmlNode, url); err != nil {
if err := s.process(ctx, htmlNode, url, response.Header); err != nil {
return nil, fmt.Errorf("process: %w", err) return nil, fmt.Errorf("process: %w", err)
} }
@@ -69,7 +79,7 @@ func (s *SingleFile) get(ctx context.Context, url string) (*http.Response, error
return response, nil return response, nil
} }
func (s *SingleFile) process(ctx context.Context, node *html.Node, pageURL string, headers http.Header) error { func (s *SingleFile) process(ctx context.Context, node *html.Node, pageURL string) error {
parsedURL, err := url.Parse(pageURL) parsedURL, err := url.Parse(pageURL)
if err != nil { if err != nil {
return fmt.Errorf("parse page url: %w", err) return fmt.Errorf("parse page url: %w", err)

42
entity/cache.go Normal file
View File

@@ -0,0 +1,42 @@
package entity
import (
"bytes"
"io"
"sync"
)
// cacheInitialCapacity is the number of bytes preallocated by NewCache
// (512 KiB), which limits regrowth while a response body is streamed in.
const cacheInitialCapacity = 512 * 1024

// NewCache returns an empty Cache with preallocated storage.
func NewCache() *Cache {
	return &Cache{data: make([]byte, 0, cacheInitialCapacity)}
}

// Cache is an append-only in-memory byte store guarded by a mutex.
//
// It implements io.Writer, so it can be used as the sink of an io.TeeReader
// to keep a copy of a stream while it is consumed, and it hands out
// independent readers over the bytes collected so far.
//
// A nil *Cache is valid: it discards writes and reports no data, which lets
// callers pass nil when caching is not wanted.
type Cache struct {
	mu   sync.RWMutex
	data []byte
}

// Write appends p to the cache. It always reports len(p) bytes written and a
// nil error. On a nil receiver the bytes are discarded.
func (c *Cache) Write(p []byte) (n int, err error) {
	if c == nil {
		return len(p), nil
	}

	c.mu.Lock()
	defer c.mu.Unlock()

	c.data = append(c.data, p...)

	return len(p), nil
}

// Get returns the bytes cached so far, or nil for a nil receiver.
//
// The result shares memory with the cache and must be treated as read-only.
// Its capacity is clipped to its length so that appending to it reallocates
// instead of writing into the spare capacity the cache itself appends into.
func (c *Cache) Get() []byte {
	if c == nil {
		return nil
	}

	c.mu.RLock()
	defer c.mu.RUnlock()

	return c.data[:len(c.data):len(c.data)]
}

// Reader returns a new reader positioned at the start of the cached bytes.
// It returns nil when the cache is empty or the receiver is nil, so callers
// can use a nil check to decide whether to fall back to another source.
//
// Each call yields an independent reader over a snapshot of the current
// content; bytes written afterwards are not visible through it.
func (c *Cache) Reader() io.Reader {
	if c == nil {
		return nil
	}

	c.mu.RLock()
	defer c.mu.RUnlock()

	if len(c.data) == 0 {
		return nil
	}

	// bytes.NewReader is read-only; bytes.NewBuffer would take ownership of
	// the shared slice and expose write methods over it.
	return bytes.NewReader(c.data)
}

View File

@@ -10,8 +10,8 @@ import (
) )
type Processor interface { type Processor interface {
Process(ctx context.Context, format Format, url string) Result Process(ctx context.Context, format Format, url string, cache *Cache) Result
GetMeta(ctx context.Context, url string) (Meta, error) GetMeta(ctx context.Context, url string, cache *Cache) (Meta, error)
} }
type Format uint8 type Format uint8
@@ -66,12 +66,14 @@ func NewPage(url string, description string, formats ...Format) *Page {
Created: time.Now(), Created: time.Now(),
Version: 1, Version: 1,
}, },
cache: NewCache(),
} }
} }
type Page struct { type Page struct {
PageBase PageBase
Results ResultsRO Results ResultsRO
cache *Cache
} }
func (p *Page) SetProcessing() { func (p *Page) SetProcessing() {
@@ -82,7 +84,7 @@ func (p *Page) Process(ctx context.Context, processor Processor) {
innerWG := sync.WaitGroup{} innerWG := sync.WaitGroup{}
innerWG.Add(len(p.Formats)) innerWG.Add(len(p.Formats))
meta, err := processor.GetMeta(ctx, p.URL) meta, err := processor.GetMeta(ctx, p.URL, p.cache)
if err != nil { if err != nil {
p.Meta.Error = err.Error() p.Meta.Error = err.Error()
} else { } else {
@@ -101,7 +103,7 @@ func (p *Page) Process(ctx context.Context, processor Processor) {
} }
}() }()
result := processor.Process(ctx, format, p.URL) result := processor.Process(ctx, format, p.URL, p.cache)
results.Add(result) results.Add(result)
}(format) }(format)
} }