mirror of
https://github.com/derfenix/webarchive.git
synced 2026-03-11 12:41:54 +03:00
Fix: reduce the number of network calls to the target URL
This commit is contained in:
@@ -17,7 +17,7 @@ type Headers struct {
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func (h *Headers) Process(ctx context.Context, url string) ([]entity.File, error) {
|
||||
func (h *Headers) Process(ctx context.Context, url string, _ *entity.Cache) ([]entity.File, error) {
|
||||
var (
|
||||
headersFile entity.File
|
||||
err error
|
||||
|
||||
@@ -19,7 +19,7 @@ type PDF struct {
|
||||
cfg config.PDF
|
||||
}
|
||||
|
||||
func (p *PDF) Process(_ context.Context, url string) ([]entity.File, error) {
|
||||
func (p *PDF) Process(_ context.Context, url string, cache *entity.Cache) ([]entity.File, error) {
|
||||
gen, err := wkhtmltopdf.NewPDFGenerator()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new pdf generator: %w", err)
|
||||
@@ -37,18 +37,29 @@ func (p *PDF) Process(_ context.Context, url string) ([]entity.File, error) {
|
||||
gen.Grayscale.Set(p.cfg.Grayscale)
|
||||
gen.Title.Set(url)
|
||||
|
||||
page := wkhtmltopdf.NewPage(url)
|
||||
page.PrintMediaType.Set(p.cfg.MediaPrint)
|
||||
page.JavascriptDelay.Set(200)
|
||||
page.LoadErrorHandling.Set("ignore")
|
||||
page.LoadMediaErrorHandling.Set("ignore")
|
||||
page.FooterRight.Set("[page]")
|
||||
page.HeaderLeft.Set(url)
|
||||
page.HeaderRight.Set(time.Now().Format(time.DateOnly))
|
||||
page.FooterFontSize.Set(10)
|
||||
page.Zoom.Set(p.cfg.Zoom)
|
||||
page.ViewportSize.Set(p.cfg.Viewport)
|
||||
page.NoBackground.Set(true)
|
||||
opts := wkhtmltopdf.NewPageOptions()
|
||||
opts.PrintMediaType.Set(p.cfg.MediaPrint)
|
||||
opts.JavascriptDelay.Set(200)
|
||||
opts.DisableJavascript.Set(true)
|
||||
opts.LoadErrorHandling.Set("ignore")
|
||||
opts.LoadMediaErrorHandling.Set("ignore")
|
||||
opts.FooterRight.Set("[opts]")
|
||||
opts.HeaderLeft.Set(url)
|
||||
opts.HeaderRight.Set(time.Now().Format(time.DateOnly))
|
||||
opts.FooterFontSize.Set(10)
|
||||
opts.Zoom.Set(p.cfg.Zoom)
|
||||
opts.ViewportSize.Set(p.cfg.Viewport)
|
||||
opts.NoBackground.Set(true)
|
||||
opts.DisableLocalFileAccess.Set(true)
|
||||
opts.DisableExternalLinks.Set(true)
|
||||
opts.DisableInternalLinks.Set(true)
|
||||
|
||||
var page wkhtmltopdf.PageProvider
|
||||
if len(cache.Get()) > 0 {
|
||||
page = &wkhtmltopdf.PageReader{Input: cache.Reader(), PageOptions: opts}
|
||||
} else {
|
||||
page = &wkhtmltopdf.Page{Input: url, PageOptions: opts}
|
||||
}
|
||||
|
||||
gen.AddPage(page)
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ package processors
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/http/cookiejar"
|
||||
@@ -18,7 +19,7 @@ import (
|
||||
const defaultEncoding = "utf-8"
|
||||
|
||||
type processor interface {
|
||||
Process(ctx context.Context, url string) ([]entity.File, error)
|
||||
Process(ctx context.Context, url string, cache *entity.Cache) ([]entity.File, error)
|
||||
}
|
||||
|
||||
func NewProcessors(cfg config.Config) (*Processors, error) {
|
||||
@@ -73,7 +74,7 @@ type Processors struct {
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func (p *Processors) Process(ctx context.Context, format entity.Format, url string) entity.Result {
|
||||
func (p *Processors) Process(ctx context.Context, format entity.Format, url string, cache *entity.Cache) entity.Result {
|
||||
result := entity.Result{Format: format}
|
||||
|
||||
proc, ok := p.processors[format]
|
||||
@@ -83,7 +84,7 @@ func (p *Processors) Process(ctx context.Context, format entity.Format, url stri
|
||||
return result
|
||||
}
|
||||
|
||||
files, err := proc.Process(ctx, url)
|
||||
files, err := proc.Process(ctx, url, cache)
|
||||
if err != nil {
|
||||
result.Err = fmt.Errorf("process: %w", err)
|
||||
|
||||
@@ -101,7 +102,7 @@ func (p *Processors) OverrideProcessor(format entity.Format, proc processor) err
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *Processors) GetMeta(ctx context.Context, url string) (entity.Meta, error) {
|
||||
func (p *Processors) GetMeta(ctx context.Context, url string, cache *entity.Cache) (entity.Meta, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
||||
if err != nil {
|
||||
return entity.Meta{}, fmt.Errorf("new request: %w", err)
|
||||
@@ -124,7 +125,9 @@ func (p *Processors) GetMeta(ctx context.Context, url string) (entity.Meta, erro
|
||||
_ = response.Body.Close()
|
||||
}()
|
||||
|
||||
htmlNode, err := html.Parse(response.Body)
|
||||
tee := io.TeeReader(response.Body, cache)
|
||||
|
||||
htmlNode, err := html.Parse(tee)
|
||||
if err != nil {
|
||||
return entity.Meta{}, fmt.Errorf("parse response body: %w", err)
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/derfenix/webarchive/config"
|
||||
"github.com/derfenix/webarchive/entity"
|
||||
)
|
||||
|
||||
func TestProcessors_GetMeta(t *testing.T) {
|
||||
@@ -20,7 +21,9 @@ func TestProcessors_GetMeta(t *testing.T) {
|
||||
procs, err := NewProcessors(cfg)
|
||||
require.NoError(t, err)
|
||||
|
||||
meta, err := procs.GetMeta(ctx, "https://habr.com/ru/companies/wirenboard/articles/722718/")
|
||||
cache := entity.NewCache()
|
||||
|
||||
meta, err := procs.GetMeta(ctx, "https://habr.com/ru/companies/wirenboard/articles/722718/", cache)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "Сколько стоит умный дом? Рассказываю, как строил свой и что получилось за 1000 руб./м² / Хабр", meta.Title)
|
||||
}
|
||||
|
||||
@@ -7,8 +7,9 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
|
||||
"github.com/derfenix/webarchive/entity"
|
||||
"golang.org/x/net/html"
|
||||
|
||||
"github.com/derfenix/webarchive/entity"
|
||||
)
|
||||
|
||||
func NewSingleFile(client *http.Client) *SingleFile {
|
||||
@@ -19,21 +20,30 @@ type SingleFile struct {
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func (s *SingleFile) Process(ctx context.Context, url string) ([]entity.File, error) {
|
||||
func (s *SingleFile) Process(ctx context.Context, url string, cache *entity.Cache) ([]entity.File, error) {
|
||||
reader := cache.Reader()
|
||||
|
||||
if reader == nil {
|
||||
response, err := s.get(ctx, url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
htmlNode, err := html.Parse(response.Body)
|
||||
if err != nil {
|
||||
if response.Body != nil {
|
||||
defer func() {
|
||||
_ = response.Body.Close()
|
||||
}()
|
||||
}
|
||||
|
||||
reader = response.Body
|
||||
}
|
||||
|
||||
htmlNode, err := html.Parse(reader)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse response body: %w", err)
|
||||
}
|
||||
|
||||
_ = response.Body.Close()
|
||||
|
||||
if err := s.process(ctx, htmlNode, url, response.Header); err != nil {
|
||||
if err := s.process(ctx, htmlNode, url); err != nil {
|
||||
return nil, fmt.Errorf("process: %w", err)
|
||||
}
|
||||
|
||||
@@ -69,7 +79,7 @@ func (s *SingleFile) get(ctx context.Context, url string) (*http.Response, error
|
||||
return response, nil
|
||||
}
|
||||
|
||||
func (s *SingleFile) process(ctx context.Context, node *html.Node, pageURL string, headers http.Header) error {
|
||||
func (s *SingleFile) process(ctx context.Context, node *html.Node, pageURL string) error {
|
||||
parsedURL, err := url.Parse(pageURL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("parse page url: %w", err)
|
||||
|
||||
42
entity/cache.go
Normal file
42
entity/cache.go
Normal file
@@ -0,0 +1,42 @@
|
||||
package entity
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// NewCache returns an empty Cache whose buffer is pre-sized so that appending
// a page body does not have to grow the buffer repeatedly.
func NewCache() *Cache {
	// Initial buffer capacity in bytes (512 KiB).
	const initialCapacity = 512 * 1024

	return &Cache{data: make([]byte, 0, initialCapacity)}
}

// Cache is an append-only in-memory byte buffer guarded by a mutex.
//
// It implements io.Writer, so it can be used as a sink that collects bytes
// while they are being read elsewhere, and it hands out independent read-only
// views of what has been collected so far.
//
// The zero value is an empty, ready-to-use cache. A nil *Cache is also valid:
// it discards writes and always reports itself as empty.
type Cache struct {
	mu   sync.RWMutex
	data []byte
}

// Write appends p to the cache. It never fails and always reports len(p)
// bytes written. On a nil receiver the bytes are discarded.
func (c *Cache) Write(p []byte) (n int, err error) {
	if c == nil {
		return len(p), nil
	}

	c.mu.Lock()
	c.data = append(c.data, p...)
	c.mu.Unlock()

	return len(p), nil
}

// Get returns the bytes collected so far, or an empty result for a nil or
// empty cache.
//
// The returned slice shares memory with the cache and must be treated as
// read-only. Its capacity is clipped to its length, so appending to it
// allocates a new array instead of writing into the cache's spare capacity,
// where a later Write would overwrite it.
func (c *Cache) Get() []byte {
	if c == nil {
		return nil
	}

	c.mu.RLock()
	defer c.mu.RUnlock()

	return c.data[:len(c.data):len(c.data)]
}

// Reader returns a new reader positioned at the start of the bytes collected
// so far, or nil when the cache is nil or empty. Every call returns an
// independent reader, so several consumers can each read the full content.
//
// The reader is a snapshot: bytes written after the call are not visible
// through it.
func (c *Cache) Reader() io.Reader {
	if c == nil {
		return nil
	}

	c.mu.RLock()
	defer c.mu.RUnlock()

	if len(c.data) == 0 {
		return nil
	}

	// bytes.NewReader is read-only. bytes.NewBuffer would take ownership of
	// the shared slice and expose Write on it to the caller.
	return bytes.NewReader(c.data)
}
|
||||
@@ -10,8 +10,8 @@ import (
|
||||
)
|
||||
|
||||
type Processor interface {
|
||||
Process(ctx context.Context, format Format, url string) Result
|
||||
GetMeta(ctx context.Context, url string) (Meta, error)
|
||||
Process(ctx context.Context, format Format, url string, cache *Cache) Result
|
||||
GetMeta(ctx context.Context, url string, cache *Cache) (Meta, error)
|
||||
}
|
||||
|
||||
type Format uint8
|
||||
@@ -66,12 +66,14 @@ func NewPage(url string, description string, formats ...Format) *Page {
|
||||
Created: time.Now(),
|
||||
Version: 1,
|
||||
},
|
||||
cache: NewCache(),
|
||||
}
|
||||
}
|
||||
|
||||
type Page struct {
|
||||
PageBase
|
||||
Results ResultsRO
|
||||
cache *Cache
|
||||
}
|
||||
|
||||
func (p *Page) SetProcessing() {
|
||||
@@ -82,7 +84,7 @@ func (p *Page) Process(ctx context.Context, processor Processor) {
|
||||
innerWG := sync.WaitGroup{}
|
||||
innerWG.Add(len(p.Formats))
|
||||
|
||||
meta, err := processor.GetMeta(ctx, p.URL)
|
||||
meta, err := processor.GetMeta(ctx, p.URL, p.cache)
|
||||
if err != nil {
|
||||
p.Meta.Error = err.Error()
|
||||
} else {
|
||||
@@ -101,7 +103,7 @@ func (p *Page) Process(ctx context.Context, processor Processor) {
|
||||
}
|
||||
}()
|
||||
|
||||
result := processor.Process(ctx, format, p.URL)
|
||||
result := processor.Process(ctx, format, p.URL, p.cache)
|
||||
results.Add(result)
|
||||
}(format)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user