mirror of
https://github.com/derfenix/webarchive.git
synced 2026-03-11 12:41:54 +03:00
Refactoring
This commit is contained in:
6
.idea/jsonSchemas.xml
generated
6
.idea/jsonSchemas.xml
generated
@@ -3,11 +3,11 @@
|
||||
<component name="JsonSchemaMappingsProjectConfiguration">
|
||||
<state>
|
||||
<map>
|
||||
<entry key="openapi">
|
||||
<entry key="OpenAPI 3.0">
|
||||
<value>
|
||||
<SchemaInfo>
|
||||
<option name="name" value="openapi" />
|
||||
<option name="relativePathToSchema" value="https://raw.githubusercontent.com/OAI/OpenAPI-Specification/main/schemas/v3.1/schema.json" />
|
||||
<option name="name" value="OpenAPI 3.0" />
|
||||
<option name="relativePathToSchema" value="https://raw.githubusercontent.com/OAI/OpenAPI-Specification/master/schemas/v3.0/schema.json" />
|
||||
<option name="applicationDefined" value="true" />
|
||||
<option name="patterns">
|
||||
<list>
|
||||
|
||||
@@ -116,3 +116,4 @@ curl -X GET --location "http://localhost:5001/api/v1/pages" | jq .
|
||||
- [ ] Multi-user access
|
||||
- [ ] Support SQL database with or without separate files storage
|
||||
- [ ] Tags/Categories
|
||||
- [ ] Save page to markdown
|
||||
|
||||
@@ -40,6 +40,7 @@ func (p *PDF) Process(_ context.Context, url string) ([]entity.File, error) {
|
||||
page := wkhtmltopdf.NewPage(url)
|
||||
page.PrintMediaType.Set(p.cfg.MediaPrint)
|
||||
page.JavascriptDelay.Set(200)
|
||||
page.LoadErrorHandling.Set("ignore")
|
||||
page.LoadMediaErrorHandling.Set("ignore")
|
||||
page.FooterRight.Set("[page]")
|
||||
page.HeaderLeft.Set(url)
|
||||
|
||||
@@ -112,17 +112,8 @@ func (s *SingleFile) findAndReplaceResources(ctx context.Context, node *html.Nod
|
||||
func (s *SingleFile) replaceResource(ctx context.Context, node *html.Node, baseURL string) error {
|
||||
for i, attribute := range node.Attr {
|
||||
if attribute.Key == "src" || attribute.Key == "href" {
|
||||
encoded, contentType, err := s.loadResource(ctx, attribute.Val, baseURL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("load resource for %s: %w", node.Data, err)
|
||||
}
|
||||
|
||||
if len(encoded) == 0 {
|
||||
attribute.Val = ""
|
||||
|
||||
} else {
|
||||
attribute.Val = fmt.Sprintf("data:%s;base64, %s", contentType, encoded)
|
||||
}
|
||||
raw, contentType := s.loadResource(ctx, attribute.Val, baseURL)
|
||||
setResource(raw, attribute, contentType, node)
|
||||
|
||||
node.Attr[i] = attribute
|
||||
}
|
||||
@@ -131,27 +122,68 @@ func (s *SingleFile) replaceResource(ctx context.Context, node *html.Node, baseU
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *SingleFile) loadResource(ctx context.Context, val, baseURL string) ([]byte, string, error) {
|
||||
func setResource(raw []byte, attribute html.Attribute, contentType string, node *html.Node) {
|
||||
if len(raw) == 0 {
|
||||
attribute.Val = ""
|
||||
} else {
|
||||
if strings.HasPrefix(contentType, "image") {
|
||||
encoded := make([]byte, base64.StdEncoding.EncodedLen(len(raw)))
|
||||
base64.StdEncoding.Encode(encoded, raw)
|
||||
attribute.Val = fmt.Sprintf("data:%s;base64, %s", contentType, encoded)
|
||||
} else {
|
||||
attribute.Val = ""
|
||||
var atomValue atom.Atom
|
||||
var data string
|
||||
|
||||
for _, attr := range node.Attr {
|
||||
if attr.Key == "type" {
|
||||
switch attr.Val {
|
||||
case "script":
|
||||
atomValue = atom.Script
|
||||
data = "script"
|
||||
case "stylesheet":
|
||||
atomValue = atom.Style
|
||||
data = "style"
|
||||
}
|
||||
}
|
||||
}
|
||||
newNode := &html.Node{
|
||||
NextSibling: node.NextSibling,
|
||||
Type: html.ElementNode,
|
||||
DataAtom: atomValue,
|
||||
Data: data,
|
||||
}
|
||||
newNode.AppendChild(&html.Node{
|
||||
Type: html.RawNode,
|
||||
DataAtom: atom.Data,
|
||||
Data: string(raw),
|
||||
})
|
||||
node.NextSibling = newNode
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SingleFile) loadResource(ctx context.Context, val, baseURL string) ([]byte, string) {
|
||||
if !strings.HasPrefix(val, "http://") && !strings.HasPrefix(val, "https://") {
|
||||
var err error
|
||||
val, err = url.JoinPath(baseURL, val)
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("join base path %s and url %s: %w", baseURL, val, err)
|
||||
return nil, ""
|
||||
}
|
||||
val, err = url.PathUnescape(val)
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("unescape path %s: %w", val, err)
|
||||
return nil, ""
|
||||
}
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, val, nil)
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("new request: %w", err)
|
||||
return nil, ""
|
||||
}
|
||||
|
||||
response, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("do request: %w", err)
|
||||
return nil, ""
|
||||
}
|
||||
|
||||
defer func() {
|
||||
@@ -161,18 +193,15 @@ func (s *SingleFile) loadResource(ctx context.Context, val, baseURL string) ([]b
|
||||
}()
|
||||
|
||||
if response.StatusCode != http.StatusOK {
|
||||
return []byte{}, "", nil
|
||||
return []byte{}, ""
|
||||
}
|
||||
|
||||
raw, err := io.ReadAll(response.Body)
|
||||
if err != nil {
|
||||
return nil, "", fmt.Errorf("read body: %w", err)
|
||||
return nil, ""
|
||||
}
|
||||
|
||||
encoded := make([]byte, base64.StdEncoding.EncodedLen(len(raw)))
|
||||
base64.StdEncoding.Encode(encoded, raw)
|
||||
|
||||
return encoded, response.Header.Get("Content-Type"), nil
|
||||
return raw, response.Header.Get("Content-Type")
|
||||
}
|
||||
|
||||
func (s *SingleFile) setCharset(node *html.Node, encoding string) {
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
openapi: 3.1.0
|
||||
---
|
||||
openapi: 3.0.3
|
||||
info:
|
||||
title: Sample API
|
||||
description: API description in Markdown.
|
||||
@@ -125,7 +126,7 @@ paths:
|
||||
200:
|
||||
description: File content
|
||||
content:
|
||||
application/pdf: { }
|
||||
application/pdf: {}
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
|
||||
@@ -49,7 +49,7 @@ func NewApplication(cfg config.Config) (Application, error) {
|
||||
worker := entity.NewWorker(workerCh, pageRepo, processor, log.Named("worker"))
|
||||
|
||||
server, err := openapi.NewServer(
|
||||
rest.NewService(pageRepo, workerCh, processor),
|
||||
rest.NewService(pageRepo, workerCh),
|
||||
openapi.WithPathPrefix("/api/v1"),
|
||||
openapi.WithMiddleware(
|
||||
func(r middleware.Request, next middleware.Next) (middleware.Response, error) {
|
||||
|
||||
@@ -75,6 +75,13 @@ func (p *Page) Process(ctx context.Context, processor Processor) {
|
||||
innerWG := sync.WaitGroup{}
|
||||
innerWG.Add(len(p.Formats))
|
||||
|
||||
meta, err := processor.GetMeta(ctx, p.URL)
|
||||
if err != nil {
|
||||
p.Meta.Error = err.Error()
|
||||
} else {
|
||||
p.Meta = meta
|
||||
}
|
||||
|
||||
for _, format := range p.Formats {
|
||||
go func(format Format) {
|
||||
defer innerWG.Done()
|
||||
|
||||
2
go.mod
2
go.mod
@@ -18,6 +18,7 @@ require (
|
||||
go.opentelemetry.io/otel/trace v1.14.0
|
||||
go.uber.org/multierr v1.10.0
|
||||
go.uber.org/zap v1.24.0
|
||||
golang.org/x/net v0.8.0
|
||||
)
|
||||
|
||||
require (
|
||||
@@ -49,7 +50,6 @@ require (
|
||||
go.opencensus.io v0.24.0 // indirect
|
||||
go.uber.org/atomic v1.9.0 // indirect
|
||||
golang.org/x/exp v0.0.0-20230206171751-46f607a40771 // indirect
|
||||
golang.org/x/net v0.8.0 // indirect
|
||||
golang.org/x/sync v0.1.0 // indirect
|
||||
golang.org/x/sys v0.6.0 // indirect
|
||||
golang.org/x/text v0.8.0 // indirect
|
||||
|
||||
@@ -20,19 +20,17 @@ type Pages interface {
|
||||
GetFile(ctx context.Context, pageID, fileID uuid.UUID) (*entity.File, error)
|
||||
}
|
||||
|
||||
func NewService(pages Pages, ch chan *entity.Page, processor entity.Processor) *Service {
|
||||
func NewService(pages Pages, ch chan *entity.Page) *Service {
|
||||
return &Service{
|
||||
pages: pages,
|
||||
ch: ch,
|
||||
processor: processor,
|
||||
pages: pages,
|
||||
ch: ch,
|
||||
}
|
||||
}
|
||||
|
||||
type Service struct {
|
||||
openapi.UnimplementedHandler
|
||||
pages Pages
|
||||
ch chan *entity.Page
|
||||
processor entity.Processor
|
||||
pages Pages
|
||||
ch chan *entity.Page
|
||||
}
|
||||
|
||||
func (s *Service) GetPage(ctx context.Context, params openapi.GetPageParams) (openapi.GetPageRes, error) {
|
||||
@@ -83,13 +81,6 @@ func (s *Service) AddPage(ctx context.Context, req openapi.OptAddPageReq, params
|
||||
page := entity.NewPage(url, description, domainFormats...)
|
||||
page.Status = entity.StatusProcessing
|
||||
|
||||
meta, err := s.processor.GetMeta(ctx, page.URL)
|
||||
if err != nil {
|
||||
page.Meta.Error = err.Error()
|
||||
} else {
|
||||
page.Meta = meta
|
||||
}
|
||||
|
||||
if err := s.pages.Save(ctx, page); err != nil {
|
||||
return nil, fmt.Errorf("save page: %w", err)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user