mirror of
https://github.com/derfenix/webarchive.git
synced 2026-03-11 12:41:54 +03:00
Initial commit
This commit is contained in:
53
adapters/processors/headers.go
Normal file
53
adapters/processors/headers.go
Normal file
@@ -0,0 +1,53 @@
|
||||
package processors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/derfenix/webarchive/entity"
|
||||
)
|
||||
|
||||
func NewHeaders(client *http.Client) *Headers {
|
||||
return &Headers{client: client}
|
||||
}
|
||||
|
||||
// Headers is a processor that captures the HTTP response headers of a URL.
type Headers struct {
	// client performs the outgoing HEAD requests.
	client *http.Client
}
|
||||
|
||||
func (h *Headers) Process(ctx context.Context, url string) ([]entity.File, error) {
|
||||
var (
|
||||
headersFile entity.File
|
||||
err error
|
||||
)
|
||||
|
||||
req, reqErr := http.NewRequestWithContext(ctx, http.MethodHead, url, nil)
|
||||
if reqErr != nil {
|
||||
return nil, fmt.Errorf("create request: %w", reqErr)
|
||||
}
|
||||
|
||||
resp, doErr := h.client.Do(req)
|
||||
if doErr != nil {
|
||||
return nil, fmt.Errorf("call url: %w", doErr)
|
||||
}
|
||||
|
||||
headersFile, err = h.newFile(resp.Header)
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new file from headers: %w", err)
|
||||
}
|
||||
|
||||
return []entity.File{headersFile}, nil
|
||||
}
|
||||
|
||||
func (h *Headers) newFile(headers http.Header) (entity.File, error) {
|
||||
buf := bytes.NewBuffer(nil)
|
||||
|
||||
if err := headers.Write(buf); err != nil {
|
||||
return entity.File{}, fmt.Errorf("write headers: %w", err)
|
||||
}
|
||||
|
||||
return entity.NewFile("headers", buf.Bytes()), nil
|
||||
}
|
||||
52
adapters/processors/pdf.go
Normal file
52
adapters/processors/pdf.go
Normal file
@@ -0,0 +1,52 @@
|
||||
package processors
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/SebastiaanKlippert/go-wkhtmltopdf"
|
||||
|
||||
"github.com/derfenix/webarchive/entity"
|
||||
)
|
||||
|
||||
func NewPDF() *PDF {
|
||||
return &PDF{}
|
||||
}
|
||||
|
||||
// PDF is a processor that renders a page into a PDF document using
// wkhtmltopdf. It carries no state of its own.
type PDF struct{}
|
||||
|
||||
func (P *PDF) Process(_ context.Context, url string) ([]entity.File, error) {
|
||||
gen, err := wkhtmltopdf.NewPDFGenerator()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new pdf generator: %w", err)
|
||||
}
|
||||
|
||||
gen.Dpi.Set(300)
|
||||
gen.PageSize.Set(wkhtmltopdf.PageSizeA4)
|
||||
gen.Orientation.Set(wkhtmltopdf.OrientationPortrait)
|
||||
gen.Grayscale.Set(false)
|
||||
gen.Title.Set(url)
|
||||
|
||||
page := wkhtmltopdf.NewPage(url)
|
||||
page.JavascriptDelay.Set(200)
|
||||
page.LoadMediaErrorHandling.Set("abort")
|
||||
page.FooterRight.Set("[page]")
|
||||
page.HeaderLeft.Set(url)
|
||||
page.HeaderRight.Set(time.Now().Format(time.DateOnly))
|
||||
page.FooterFontSize.Set(10)
|
||||
page.Zoom.Set(1)
|
||||
page.ViewportSize.Set("1920x1080")
|
||||
|
||||
gen.AddPage(page)
|
||||
|
||||
// Create PDF document in internal buffer
|
||||
err = gen.Create()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create pdf: %w", err)
|
||||
}
|
||||
|
||||
file := entity.NewFile("page.pdf", gen.Bytes())
|
||||
|
||||
return []entity.File{file}, nil
|
||||
}
|
||||
29
adapters/processors/pdf_test.go
Normal file
29
adapters/processors/pdf_test.go
Normal file
@@ -0,0 +1,29 @@
|
||||
package processors
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestPDF_Process(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if testing.Short() {
|
||||
t.Skip("skip test with external resource")
|
||||
}
|
||||
|
||||
files, err := (&PDF{}).Process(context.Background(), "https://github.com/SebastiaanKlippert/go-wkhtmltopdf")
|
||||
require.NoError(t, err)
|
||||
require.Len(t, files, 1)
|
||||
|
||||
f := files[0]
|
||||
fmt.Println("ID ", f.ID)
|
||||
fmt.Println("Name ", f.Name)
|
||||
fmt.Println("MimeType ", f.MimeType)
|
||||
fmt.Println("Size ", f.Size)
|
||||
fmt.Println("Created ", f.Created.Format(time.RFC3339))
|
||||
}
|
||||
93
adapters/processors/processors.go
Normal file
93
adapters/processors/processors.go
Normal file
@@ -0,0 +1,93 @@
|
||||
package processors
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/http/cookiejar"
|
||||
"time"
|
||||
|
||||
"github.com/derfenix/webarchive/entity"
|
||||
)
|
||||
|
||||
type processor interface {
|
||||
Process(ctx context.Context, url string) ([]entity.File, error)
|
||||
}
|
||||
|
||||
func NewProcessors() (*Processors, error) {
|
||||
jar, err := cookiejar.New(&cookiejar.Options{
|
||||
PublicSuffixList: nil,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create cookie jar: %w", err)
|
||||
}
|
||||
|
||||
httpClient := &http.Client{
|
||||
Transport: &http.Transport{
|
||||
DialContext: (&net.Dialer{
|
||||
Timeout: time.Second * 10,
|
||||
KeepAlive: time.Second * 10,
|
||||
}).DialContext,
|
||||
MaxIdleConns: 20,
|
||||
MaxIdleConnsPerHost: 5,
|
||||
MaxConnsPerHost: 10,
|
||||
IdleConnTimeout: time.Second * 60,
|
||||
ResponseHeaderTimeout: time.Second * 20,
|
||||
MaxResponseHeaderBytes: 1024 * 1024 * 50,
|
||||
WriteBufferSize: 256,
|
||||
ReadBufferSize: 1024 * 64,
|
||||
ForceAttemptHTTP2: true,
|
||||
},
|
||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||
if len(via) > 3 {
|
||||
return fmt.Errorf("too many redirects")
|
||||
}
|
||||
|
||||
return nil
|
||||
},
|
||||
Jar: jar,
|
||||
Timeout: time.Second * 30,
|
||||
}
|
||||
|
||||
procs := Processors{
|
||||
processors: map[entity.Format]processor{
|
||||
entity.FormatHeaders: NewHeaders(httpClient),
|
||||
entity.FormatPDF: NewPDF(),
|
||||
},
|
||||
}
|
||||
|
||||
return &procs, nil
|
||||
}
|
||||
|
||||
type Processors struct {
|
||||
processors map[entity.Format]processor
|
||||
}
|
||||
|
||||
func (p *Processors) Process(ctx context.Context, format entity.Format, url string) entity.Result {
|
||||
result := entity.Result{Format: format}
|
||||
|
||||
proc, ok := p.processors[format]
|
||||
if !ok {
|
||||
result.Err = fmt.Errorf("no processor registered for format %v", format)
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
files, err := proc.Process(ctx, url)
|
||||
if err != nil {
|
||||
result.Err = fmt.Errorf("process: %w", err)
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
result.Files = files
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func (p *Processors) Override(format entity.Format, proc processor) error {
|
||||
p.processors[format] = proc
|
||||
|
||||
return nil
|
||||
}
|
||||
BIN
adapters/processors/simplesample.pdf
Normal file
BIN
adapters/processors/simplesample.pdf
Normal file
Binary file not shown.
128
adapters/repository/badger/db.go
Normal file
128
adapters/repository/badger/db.go
Normal file
@@ -0,0 +1,128 @@
|
||||
package badger
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path"
|
||||
|
||||
"github.com/dgraph-io/badger/v4"
|
||||
"github.com/dgraph-io/badger/v4/options"
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
// backupStartPath is the file name, relative to the database directory, of
// the backup selected by BackupStart.
const backupStartPath = "backup_start.db"

// backupStopPath is the file name, relative to the database directory, of
// the backup selected by BackupStop.
const backupStopPath = "backup_stop.db"
|
||||
|
||||
// BackupType selects which backup file a backup operation writes to.
type BackupType uint8

const (
	// BackupStart selects the start backup file; it is the zero value.
	BackupStart BackupType = 0
	// BackupStop selects the stop backup file.
	BackupStop BackupType = 1
)
|
||||
|
||||
// ErrDBClosed is returned by repository operations that find the database
// already closed.
var ErrDBClosed = fmt.Errorf("database is closed")
|
||||
|
||||
type logger struct {
|
||||
*zap.SugaredLogger
|
||||
}
|
||||
|
||||
func (l *logger) Warningf(s string, i ...interface{}) {
|
||||
l.SugaredLogger.Warnf(s, i...)
|
||||
}
|
||||
|
||||
func NewBadger(dir string, log *zap.Logger) (*badger.DB, error) {
|
||||
opts := badger.DefaultOptions(dir)
|
||||
opts.Logger = &logger{SugaredLogger: log.Sugar()}
|
||||
opts.Compression = options.ZSTD
|
||||
opts.ZSTDCompressionLevel = 6
|
||||
|
||||
db, err := badger.Open(opts)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("open database: %w", err)
|
||||
}
|
||||
|
||||
if err := Backup(db, BackupStart); err != nil {
|
||||
log.Error("backup on start failed", zap.Error(err))
|
||||
}
|
||||
|
||||
return db, nil
|
||||
}
|
||||
|
||||
func Backup(db *badger.DB, bt BackupType) error {
|
||||
dir := db.Opts().Dir
|
||||
var backupPath string
|
||||
|
||||
switch bt {
|
||||
case BackupStart:
|
||||
backupPath = path.Join(dir, backupStartPath)
|
||||
case BackupStop:
|
||||
backupPath = path.Join(dir, backupStopPath)
|
||||
}
|
||||
|
||||
file, err := os.OpenFile(backupPath, os.O_CREATE|os.O_WRONLY, os.FileMode(0600))
|
||||
if err != nil {
|
||||
return fmt.Errorf("open backup file %s: %w", backupPath, err)
|
||||
}
|
||||
defer func() {
|
||||
_ = file.Close()
|
||||
}()
|
||||
|
||||
_, err = db.Backup(file, 0)
|
||||
if err != nil {
|
||||
return fmt.Errorf("backup: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func Restore(db *badger.DB) error {
|
||||
dir := db.Opts().Dir
|
||||
|
||||
backupPathStart := path.Join(dir, backupStartPath)
|
||||
backupPathStop := path.Join(dir, backupStopPath)
|
||||
|
||||
startStat, err := os.Stat(backupPathStart)
|
||||
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return fmt.Errorf("stat file %s: %w", backupPathStart, err)
|
||||
}
|
||||
|
||||
stopStat, err := os.Stat(backupPathStop)
|
||||
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return fmt.Errorf("stat file %s: %w", backupPathStop, err)
|
||||
}
|
||||
|
||||
var backupFile string
|
||||
|
||||
switch {
|
||||
case stopStat != nil && startStat != nil:
|
||||
if stopStat.ModTime().After(startStat.ModTime()) {
|
||||
backupFile = backupPathStop
|
||||
} else {
|
||||
backupFile = backupPathStart
|
||||
}
|
||||
|
||||
case stopStat != nil:
|
||||
backupFile = backupPathStart
|
||||
|
||||
case startStat != nil:
|
||||
backupFile = backupPathStop
|
||||
}
|
||||
|
||||
file, err := os.OpenFile(backupFile, os.O_RDONLY, os.FileMode(0600))
|
||||
if err != nil {
|
||||
return fmt.Errorf("open backup file %s: %w", backupFile, err)
|
||||
}
|
||||
|
||||
defer func() {
|
||||
_ = file.Close()
|
||||
}()
|
||||
|
||||
if err := db.Load(file, 20); err != nil {
|
||||
return fmt.Errorf("load backup: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
40
adapters/repository/badger/file.go
Normal file
40
adapters/repository/badger/file.go
Normal file
@@ -0,0 +1,40 @@
|
||||
package badger
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"github.com/dgraph-io/badger/v4"
|
||||
|
||||
"github.com/derfenix/webarchive/entity"
|
||||
)
|
||||
|
||||
func NewFile(db *badger.DB) *File {
|
||||
return &File{db: db, prefix: []byte("file:")}
|
||||
}
|
||||
|
||||
type File struct {
|
||||
db *badger.DB
|
||||
prefix []byte
|
||||
}
|
||||
|
||||
func (f *File) SaveTx(_ context.Context, txn *badger.Txn, file *entity.File) error {
|
||||
if f.db.IsClosed() {
|
||||
return ErrDBClosed
|
||||
}
|
||||
|
||||
marshaled, err := marshal(file)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal data: %w", err)
|
||||
}
|
||||
|
||||
if err := txn.Set(f.key(file), marshaled); err != nil {
|
||||
return fmt.Errorf("put data: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f *File) key(file *entity.File) []byte {
|
||||
return append(f.prefix, []byte(file.ID.String())...)
|
||||
}
|
||||
13
adapters/repository/badger/marshal.go
Normal file
13
adapters/repository/badger/marshal.go
Normal file
@@ -0,0 +1,13 @@
|
||||
package badger
|
||||
|
||||
import (
|
||||
"github.com/vmihailenco/msgpack/v5"
|
||||
)
|
||||
|
||||
func marshal(v interface{}) ([]byte, error) {
|
||||
return msgpack.Marshal(v)
|
||||
}
|
||||
|
||||
func unmarshal(b []byte, v interface{}) error {
|
||||
return msgpack.Unmarshal(b, v)
|
||||
}
|
||||
142
adapters/repository/badger/page.go
Normal file
142
adapters/repository/badger/page.go
Normal file
@@ -0,0 +1,142 @@
|
||||
package badger
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
"github.com/dgraph-io/badger/v4"
|
||||
"github.com/google/uuid"
|
||||
|
||||
"github.com/derfenix/webarchive/entity"
|
||||
)
|
||||
|
||||
func NewPage(db *badger.DB, file *File) (*Page, error) {
|
||||
return &Page{
|
||||
db: db,
|
||||
prefix: []byte("page:"),
|
||||
file: file,
|
||||
}, nil
|
||||
}
|
||||
|
||||
type Page struct {
|
||||
db *badger.DB
|
||||
prefix []byte
|
||||
file *File
|
||||
}
|
||||
|
||||
func (p *Page) Save(ctx context.Context, site *entity.Page) error {
|
||||
if p.db.IsClosed() {
|
||||
return ErrDBClosed
|
||||
}
|
||||
|
||||
marshaled, err := marshal(site)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal data: %w", err)
|
||||
}
|
||||
|
||||
if err := p.db.Update(func(txn *badger.Txn) error {
|
||||
if err := txn.Set(p.key(site), marshaled); err != nil {
|
||||
return fmt.Errorf("put data: %w", err)
|
||||
}
|
||||
|
||||
for i, result := range site.Results.Results() {
|
||||
for j, file := range result.Files {
|
||||
if err := p.file.SaveTx(ctx, txn, &file); err != nil {
|
||||
return fmt.Errorf("save file %d (%s) for result %d: %w", j, file.ID.String(), i, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}); err != nil {
|
||||
return fmt.Errorf("update db: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *Page) Get(_ context.Context, id uuid.UUID) (*entity.Page, error) {
|
||||
site := entity.Page{ID: id}
|
||||
|
||||
err := p.db.View(func(txn *badger.Txn) error {
|
||||
data, err := txn.Get(p.key(&site))
|
||||
if err != nil {
|
||||
return fmt.Errorf("get data: %w", err)
|
||||
}
|
||||
|
||||
err = data.Value(func(val []byte) error {
|
||||
if err := unmarshal(val, &site); err != nil {
|
||||
return fmt.Errorf("unmarshal data: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("get value: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("view: %w", err)
|
||||
}
|
||||
|
||||
return &site, nil
|
||||
}
|
||||
|
||||
func (p *Page) ListAll(ctx context.Context) ([]*entity.Page, error) {
|
||||
pages := make([]*entity.Page, 0, 100)
|
||||
|
||||
err := p.db.View(func(txn *badger.Txn) error {
|
||||
iterator := txn.NewIterator(badger.DefaultIteratorOptions)
|
||||
|
||||
defer iterator.Close()
|
||||
|
||||
for iterator.Seek(p.prefix); iterator.ValidForPrefix(p.prefix); iterator.Next() {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return fmt.Errorf("context canceled: %w", err)
|
||||
}
|
||||
|
||||
var page entity.Page
|
||||
|
||||
err := iterator.Item().Value(func(val []byte) error {
|
||||
if err := unmarshal(val, &page); err != nil {
|
||||
return fmt.Errorf("unmarshal: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("get item: %w", err)
|
||||
}
|
||||
|
||||
pages = append(pages, &entity.Page{
|
||||
ID: page.ID,
|
||||
URL: page.URL,
|
||||
Description: page.Description,
|
||||
Created: page.Created,
|
||||
Formats: page.Formats,
|
||||
Version: page.Version,
|
||||
Status: page.Status,
|
||||
})
|
||||
}
|
||||
|
||||
return nil
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("view: %w", err)
|
||||
}
|
||||
|
||||
sort.Slice(pages, func(i, j int) bool {
|
||||
return pages[i].Created.After(pages[j].Created)
|
||||
})
|
||||
|
||||
return pages, nil
|
||||
}
|
||||
|
||||
func (p *Page) key(site *entity.Page) []byte {
|
||||
return append(p.prefix, []byte(site.ID.String())...)
|
||||
}
|
||||
60
adapters/repository/badger/page_test.go
Normal file
60
adapters/repository/badger/page_test.go
Normal file
@@ -0,0 +1,60 @@
|
||||
package badger
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/zap/zaptest"
|
||||
|
||||
"github.com/derfenix/webarchive/entity"
|
||||
)
|
||||
|
||||
func TestSite(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if testing.Short() {
|
||||
t.Skip("skip db test")
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
tempDir, err := os.MkdirTemp(os.TempDir(), "badger_test")
|
||||
require.NoError(t, err)
|
||||
|
||||
t.Cleanup(func() {
|
||||
assert.NoError(t, os.RemoveAll(tempDir))
|
||||
})
|
||||
|
||||
log := zaptest.NewLogger(t)
|
||||
|
||||
db, err := NewBadger(tempDir, log.Named("db"))
|
||||
require.NoError(t, err)
|
||||
|
||||
siteRepo, err := NewPage(db, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
t.Run("base path", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
site := entity.NewPage("https://google.com", "Save all google", entity.FormatPDF, entity.FormatSingleFile)
|
||||
site.Created = site.Created.Truncate(time.Microsecond)
|
||||
|
||||
err := siteRepo.Save(ctx, site)
|
||||
require.NoError(t, err)
|
||||
|
||||
storedSite, err := siteRepo.Get(ctx, site.ID)
|
||||
require.NoError(t, err)
|
||||
|
||||
assert.Equal(t, site, storedSite)
|
||||
|
||||
all, err := siteRepo.ListAll(ctx)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, all, 1)
|
||||
|
||||
assert.Equal(t, site, all[0])
|
||||
})
|
||||
}
|
||||
Reference in New Issue
Block a user