Initial commit

This commit is contained in:
2023-03-26 16:11:00 +03:00
commit 92469fa3a2
47 changed files with 5610 additions and 0 deletions

View File

@@ -0,0 +1,53 @@
package processors
import (
"bytes"
"context"
"fmt"
"net/http"
"github.com/derfenix/webarchive/entity"
)
// NewHeaders builds a Headers processor that performs its requests
// through the supplied HTTP client.
func NewHeaders(client *http.Client) *Headers {
	h := new(Headers)
	h.client = client

	return h
}
// Headers is a processor that stores the HTTP response headers of a URL
// as a file.
type Headers struct {
// client is the HTTP client used to issue the HEAD request.
client *http.Client
}
// Process issues a HEAD request to url and returns a single file holding
// the response headers in wire format.
//
// An error is returned when the request cannot be built, the call fails,
// or the headers cannot be serialized.
func (h *Headers) Process(ctx context.Context, url string) ([]entity.File, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodHead, url, nil)
	if err != nil {
		return nil, fmt.Errorf("create request: %w", err)
	}

	resp, err := h.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("call url: %w", err)
	}
	// The body must always be closed, even for HEAD responses, otherwise
	// the underlying connection is leaked. The close error is ignored
	// because nothing is read from the body.
	defer func() { _ = resp.Body.Close() }()

	headersFile, err := h.newFile(resp.Header)
	if err != nil {
		return nil, fmt.Errorf("new file from headers: %w", err)
	}

	return []entity.File{headersFile}, nil
}
// newFile serializes headers in wire format and wraps the resulting bytes
// into an entity.File named "headers".
func (h *Headers) newFile(headers http.Header) (entity.File, error) {
	var raw bytes.Buffer

	writeErr := headers.Write(&raw)
	if writeErr != nil {
		return entity.File{}, fmt.Errorf("write headers: %w", writeErr)
	}

	file := entity.NewFile("headers", raw.Bytes())

	return file, nil
}

View File

@@ -0,0 +1,52 @@
package processors
import (
"context"
"fmt"
"time"
"github.com/SebastiaanKlippert/go-wkhtmltopdf"
"github.com/derfenix/webarchive/entity"
)
// NewPDF returns a ready-to-use PDF processor.
func NewPDF() *PDF {
	return new(PDF)
}
// PDF is a processor that renders a URL into a PDF document using
// wkhtmltopdf. It carries no state.
type PDF struct{}
// Process renders the page at url into a PDF document and returns it as a
// single file named "page.pdf".
//
// ctx is forwarded to the PDF generator, so cancelling it (or hitting its
// deadline) aborts the rendering instead of letting it run unattended.
// An error is returned when the generator cannot be created or the
// rendering fails.
func (p *PDF) Process(ctx context.Context, url string) ([]entity.File, error) {
	gen, err := wkhtmltopdf.NewPDFGenerator()
	if err != nil {
		return nil, fmt.Errorf("new pdf generator: %w", err)
	}

	// Document-wide options.
	gen.Dpi.Set(300)
	gen.PageSize.Set(wkhtmltopdf.PageSizeA4)
	gen.Orientation.Set(wkhtmltopdf.OrientationPortrait)
	gen.Grayscale.Set(false)
	gen.Title.Set(url)

	// Per-page options: the source URL and the current date go into the
	// header, the page number into the footer.
	page := wkhtmltopdf.NewPage(url)
	page.JavascriptDelay.Set(200)
	page.LoadMediaErrorHandling.Set("abort")
	page.FooterRight.Set("[page]")
	page.HeaderLeft.Set(url)
	page.HeaderRight.Set(time.Now().Format(time.DateOnly))
	page.FooterFontSize.Set(10)
	page.Zoom.Set(1)
	page.ViewportSize.Set("1920x1080")

	gen.AddPage(page)

	// Create the PDF document in the generator's internal buffer,
	// honouring cancellation of ctx.
	if err := gen.CreateContext(ctx); err != nil {
		return nil, fmt.Errorf("create pdf: %w", err)
	}

	file := entity.NewFile("page.pdf", gen.Bytes())

	return []entity.File{file}, nil
}

View File

@@ -0,0 +1,29 @@
package processors
import (
"context"
"fmt"
"testing"
"time"
"github.com/stretchr/testify/require"
)
// TestPDF_Process renders a real web page into a PDF and prints the
// metadata of the produced file. It needs network access and is skipped
// in short mode.
func TestPDF_Process(t *testing.T) {
	t.Parallel()

	if testing.Short() {
		t.Skip("skip test with external resource")
	}

	const target = "https://github.com/SebastiaanKlippert/go-wkhtmltopdf"

	pdf := &PDF{}

	produced, procErr := pdf.Process(context.Background(), target)
	require.NoError(t, procErr)
	require.Len(t, produced, 1)

	doc := produced[0]
	created := doc.Created.Format(time.RFC3339)

	fmt.Println("ID ", doc.ID)
	fmt.Println("Name ", doc.Name)
	fmt.Println("MimeType ", doc.MimeType)
	fmt.Println("Size ", doc.Size)
	fmt.Println("Created ", created)
}

View File

@@ -0,0 +1,93 @@
package processors
import (
"context"
"fmt"
"net"
"net/http"
"net/http/cookiejar"
"time"
"github.com/derfenix/webarchive/entity"
)
// processor is implemented by every format-specific handler registered in
// Processors.
type processor interface {
// Process handles the given url and returns the files it produced, or an
// error.
Process(ctx context.Context, url string) ([]entity.File, error)
}
// NewProcessors creates the default processor registry: a Headers
// processor backed by a shared, cookie-aware HTTP client and a PDF
// processor.
//
// An error is returned only when the cookie jar cannot be created.
func NewProcessors() (*Processors, error) {
	cookies, jarErr := cookiejar.New(&cookiejar.Options{PublicSuffixList: nil})
	if jarErr != nil {
		return nil, fmt.Errorf("create cookie jar: %w", jarErr)
	}

	dialer := &net.Dialer{
		Timeout:   10 * time.Second,
		KeepAlive: 10 * time.Second,
	}

	transport := &http.Transport{
		DialContext:            dialer.DialContext,
		MaxIdleConns:           20,
		MaxIdleConnsPerHost:    5,
		MaxConnsPerHost:        10,
		IdleConnTimeout:        60 * time.Second,
		ResponseHeaderTimeout:  20 * time.Second,
		MaxResponseHeaderBytes: 50 * 1024 * 1024,
		WriteBufferSize:        256,
		ReadBufferSize:         64 * 1024,
		ForceAttemptHTTP2:      true,
	}

	// limitRedirects aborts the redirect chain once more than three
	// requests have already been made.
	limitRedirects := func(_ *http.Request, via []*http.Request) error {
		if len(via) <= 3 {
			return nil
		}

		return fmt.Errorf("too many redirects")
	}

	httpClient := &http.Client{
		Transport:     transport,
		CheckRedirect: limitRedirects,
		Jar:           cookies,
		Timeout:       30 * time.Second,
	}

	registry := map[entity.Format]processor{
		entity.FormatHeaders: NewHeaders(httpClient),
		entity.FormatPDF:     NewPDF(),
	}

	return &Processors{processors: registry}, nil
}
// Processors dispatches processing requests to the processor registered
// for the requested format.
type Processors struct {
// processors maps a format to the processor that handles it.
processors map[entity.Format]processor
}
// Process runs the processor registered for format against url and
// reports the outcome as an entity.Result.
//
// The returned Result always carries the requested format. Err is set
// when no processor is registered for the format or when the processor
// fails; otherwise Files holds the produced files.
func (p *Processors) Process(ctx context.Context, format entity.Format, url string) entity.Result {
	handler, registered := p.processors[format]
	if !registered {
		return entity.Result{
			Format: format,
			Err:    fmt.Errorf("no processor registered for format %v", format),
		}
	}

	produced, procErr := handler.Process(ctx, url)
	if procErr != nil {
		return entity.Result{
			Format: format,
			Err:    fmt.Errorf("process: %w", procErr),
		}
	}

	return entity.Result{Format: format, Files: produced}
}
// Override registers proc as the processor for format, replacing any
// processor previously registered for it.
//
// It returns an error when proc is nil, because a nil processor would
// panic on the next Process call for that format. The registry map is
// allocated on demand, so Override is also safe on a zero-value
// Processors.
func (p *Processors) Override(format entity.Format, proc processor) error {
	if proc == nil {
		return fmt.Errorf("override format %v: processor is nil", format)
	}

	// Writing to a nil map panics; allocate it lazily.
	if p.processors == nil {
		p.processors = make(map[entity.Format]processor)
	}

	p.processors[format] = proc

	return nil
}

Binary file not shown.