mirror of
https://github.com/derfenix/webarchive.git
synced 2026-03-11 12:41:54 +03:00
Refactoring
This commit is contained in:
6
.idea/jsonSchemas.xml
generated
6
.idea/jsonSchemas.xml
generated
@@ -3,11 +3,11 @@
|
|||||||
<component name="JsonSchemaMappingsProjectConfiguration">
|
<component name="JsonSchemaMappingsProjectConfiguration">
|
||||||
<state>
|
<state>
|
||||||
<map>
|
<map>
|
||||||
<entry key="openapi">
|
<entry key="OpenAPI 3.0">
|
||||||
<value>
|
<value>
|
||||||
<SchemaInfo>
|
<SchemaInfo>
|
||||||
<option name="name" value="openapi" />
|
<option name="name" value="OpenAPI 3.0" />
|
||||||
<option name="relativePathToSchema" value="https://raw.githubusercontent.com/OAI/OpenAPI-Specification/main/schemas/v3.1/schema.json" />
|
<option name="relativePathToSchema" value="https://raw.githubusercontent.com/OAI/OpenAPI-Specification/master/schemas/v3.0/schema.json" />
|
||||||
<option name="applicationDefined" value="true" />
|
<option name="applicationDefined" value="true" />
|
||||||
<option name="patterns">
|
<option name="patterns">
|
||||||
<list>
|
<list>
|
||||||
|
|||||||
@@ -116,3 +116,4 @@ curl -X GET --location "http://localhost:5001/api/v1/pages" | jq .
|
|||||||
- [ ] Multi-user access
|
- [ ] Multi-user access
|
||||||
- [ ] Support SQL database with or without separate files storage
|
- [ ] Support SQL database with or without separate files storage
|
||||||
- [ ] Tags/Categories
|
- [ ] Tags/Categories
|
||||||
|
- [ ] Save page to markdown
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ func (p *PDF) Process(_ context.Context, url string) ([]entity.File, error) {
|
|||||||
page := wkhtmltopdf.NewPage(url)
|
page := wkhtmltopdf.NewPage(url)
|
||||||
page.PrintMediaType.Set(p.cfg.MediaPrint)
|
page.PrintMediaType.Set(p.cfg.MediaPrint)
|
||||||
page.JavascriptDelay.Set(200)
|
page.JavascriptDelay.Set(200)
|
||||||
|
page.LoadErrorHandling.Set("ignore")
|
||||||
page.LoadMediaErrorHandling.Set("ignore")
|
page.LoadMediaErrorHandling.Set("ignore")
|
||||||
page.FooterRight.Set("[page]")
|
page.FooterRight.Set("[page]")
|
||||||
page.HeaderLeft.Set(url)
|
page.HeaderLeft.Set(url)
|
||||||
|
|||||||
@@ -112,17 +112,8 @@ func (s *SingleFile) findAndReplaceResources(ctx context.Context, node *html.Nod
|
|||||||
func (s *SingleFile) replaceResource(ctx context.Context, node *html.Node, baseURL string) error {
|
func (s *SingleFile) replaceResource(ctx context.Context, node *html.Node, baseURL string) error {
|
||||||
for i, attribute := range node.Attr {
|
for i, attribute := range node.Attr {
|
||||||
if attribute.Key == "src" || attribute.Key == "href" {
|
if attribute.Key == "src" || attribute.Key == "href" {
|
||||||
encoded, contentType, err := s.loadResource(ctx, attribute.Val, baseURL)
|
raw, contentType := s.loadResource(ctx, attribute.Val, baseURL)
|
||||||
if err != nil {
|
setResource(raw, attribute, contentType, node)
|
||||||
return fmt.Errorf("load resource for %s: %w", node.Data, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(encoded) == 0 {
|
|
||||||
attribute.Val = ""
|
|
||||||
|
|
||||||
} else {
|
|
||||||
attribute.Val = fmt.Sprintf("data:%s;base64, %s", contentType, encoded)
|
|
||||||
}
|
|
||||||
|
|
||||||
node.Attr[i] = attribute
|
node.Attr[i] = attribute
|
||||||
}
|
}
|
||||||
@@ -131,27 +122,68 @@ func (s *SingleFile) replaceResource(ctx context.Context, node *html.Node, baseU
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *SingleFile) loadResource(ctx context.Context, val, baseURL string) ([]byte, string, error) {
|
func setResource(raw []byte, attribute html.Attribute, contentType string, node *html.Node) {
|
||||||
|
if len(raw) == 0 {
|
||||||
|
attribute.Val = ""
|
||||||
|
} else {
|
||||||
|
if strings.HasPrefix(contentType, "image") {
|
||||||
|
encoded := make([]byte, base64.StdEncoding.EncodedLen(len(raw)))
|
||||||
|
base64.StdEncoding.Encode(encoded, raw)
|
||||||
|
attribute.Val = fmt.Sprintf("data:%s;base64, %s", contentType, encoded)
|
||||||
|
} else {
|
||||||
|
attribute.Val = ""
|
||||||
|
var atomValue atom.Atom
|
||||||
|
var data string
|
||||||
|
|
||||||
|
for _, attr := range node.Attr {
|
||||||
|
if attr.Key == "type" {
|
||||||
|
switch attr.Val {
|
||||||
|
case "script":
|
||||||
|
atomValue = atom.Script
|
||||||
|
data = "script"
|
||||||
|
case "stylesheet":
|
||||||
|
atomValue = atom.Style
|
||||||
|
data = "style"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
newNode := &html.Node{
|
||||||
|
NextSibling: node.NextSibling,
|
||||||
|
Type: html.ElementNode,
|
||||||
|
DataAtom: atomValue,
|
||||||
|
Data: data,
|
||||||
|
}
|
||||||
|
newNode.AppendChild(&html.Node{
|
||||||
|
Type: html.RawNode,
|
||||||
|
DataAtom: atom.Data,
|
||||||
|
Data: string(raw),
|
||||||
|
})
|
||||||
|
node.NextSibling = newNode
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *SingleFile) loadResource(ctx context.Context, val, baseURL string) ([]byte, string) {
|
||||||
if !strings.HasPrefix(val, "http://") && !strings.HasPrefix(val, "https://") {
|
if !strings.HasPrefix(val, "http://") && !strings.HasPrefix(val, "https://") {
|
||||||
var err error
|
var err error
|
||||||
val, err = url.JoinPath(baseURL, val)
|
val, err = url.JoinPath(baseURL, val)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, "", fmt.Errorf("join base path %s and url %s: %w", baseURL, val, err)
|
return nil, ""
|
||||||
}
|
}
|
||||||
val, err = url.PathUnescape(val)
|
val, err = url.PathUnescape(val)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, "", fmt.Errorf("unescape path %s: %w", val, err)
|
return nil, ""
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, val, nil)
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, val, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, "", fmt.Errorf("new request: %w", err)
|
return nil, ""
|
||||||
}
|
}
|
||||||
|
|
||||||
response, err := s.client.Do(req)
|
response, err := s.client.Do(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, "", fmt.Errorf("do request: %w", err)
|
return nil, ""
|
||||||
}
|
}
|
||||||
|
|
||||||
defer func() {
|
defer func() {
|
||||||
@@ -161,18 +193,15 @@ func (s *SingleFile) loadResource(ctx context.Context, val, baseURL string) ([]b
|
|||||||
}()
|
}()
|
||||||
|
|
||||||
if response.StatusCode != http.StatusOK {
|
if response.StatusCode != http.StatusOK {
|
||||||
return []byte{}, "", nil
|
return []byte{}, ""
|
||||||
}
|
}
|
||||||
|
|
||||||
raw, err := io.ReadAll(response.Body)
|
raw, err := io.ReadAll(response.Body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, "", fmt.Errorf("read body: %w", err)
|
return nil, ""
|
||||||
}
|
}
|
||||||
|
|
||||||
encoded := make([]byte, base64.StdEncoding.EncodedLen(len(raw)))
|
return raw, response.Header.Get("Content-Type")
|
||||||
base64.StdEncoding.Encode(encoded, raw)
|
|
||||||
|
|
||||||
return encoded, response.Header.Get("Content-Type"), nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *SingleFile) setCharset(node *html.Node, encoding string) {
|
func (s *SingleFile) setCharset(node *html.Node, encoding string) {
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
openapi: 3.1.0
|
---
|
||||||
|
openapi: 3.0.3
|
||||||
info:
|
info:
|
||||||
title: Sample API
|
title: Sample API
|
||||||
description: API description in Markdown.
|
description: API description in Markdown.
|
||||||
@@ -125,7 +126,7 @@ paths:
|
|||||||
200:
|
200:
|
||||||
description: File content
|
description: File content
|
||||||
content:
|
content:
|
||||||
application/pdf: { }
|
application/pdf: {}
|
||||||
text/plain:
|
text/plain:
|
||||||
schema:
|
schema:
|
||||||
type: string
|
type: string
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ func NewApplication(cfg config.Config) (Application, error) {
|
|||||||
worker := entity.NewWorker(workerCh, pageRepo, processor, log.Named("worker"))
|
worker := entity.NewWorker(workerCh, pageRepo, processor, log.Named("worker"))
|
||||||
|
|
||||||
server, err := openapi.NewServer(
|
server, err := openapi.NewServer(
|
||||||
rest.NewService(pageRepo, workerCh, processor),
|
rest.NewService(pageRepo, workerCh),
|
||||||
openapi.WithPathPrefix("/api/v1"),
|
openapi.WithPathPrefix("/api/v1"),
|
||||||
openapi.WithMiddleware(
|
openapi.WithMiddleware(
|
||||||
func(r middleware.Request, next middleware.Next) (middleware.Response, error) {
|
func(r middleware.Request, next middleware.Next) (middleware.Response, error) {
|
||||||
|
|||||||
@@ -75,6 +75,13 @@ func (p *Page) Process(ctx context.Context, processor Processor) {
|
|||||||
innerWG := sync.WaitGroup{}
|
innerWG := sync.WaitGroup{}
|
||||||
innerWG.Add(len(p.Formats))
|
innerWG.Add(len(p.Formats))
|
||||||
|
|
||||||
|
meta, err := processor.GetMeta(ctx, p.URL)
|
||||||
|
if err != nil {
|
||||||
|
p.Meta.Error = err.Error()
|
||||||
|
} else {
|
||||||
|
p.Meta = meta
|
||||||
|
}
|
||||||
|
|
||||||
for _, format := range p.Formats {
|
for _, format := range p.Formats {
|
||||||
go func(format Format) {
|
go func(format Format) {
|
||||||
defer innerWG.Done()
|
defer innerWG.Done()
|
||||||
|
|||||||
2
go.mod
2
go.mod
@@ -18,6 +18,7 @@ require (
|
|||||||
go.opentelemetry.io/otel/trace v1.14.0
|
go.opentelemetry.io/otel/trace v1.14.0
|
||||||
go.uber.org/multierr v1.10.0
|
go.uber.org/multierr v1.10.0
|
||||||
go.uber.org/zap v1.24.0
|
go.uber.org/zap v1.24.0
|
||||||
|
golang.org/x/net v0.8.0
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
@@ -49,7 +50,6 @@ require (
|
|||||||
go.opencensus.io v0.24.0 // indirect
|
go.opencensus.io v0.24.0 // indirect
|
||||||
go.uber.org/atomic v1.9.0 // indirect
|
go.uber.org/atomic v1.9.0 // indirect
|
||||||
golang.org/x/exp v0.0.0-20230206171751-46f607a40771 // indirect
|
golang.org/x/exp v0.0.0-20230206171751-46f607a40771 // indirect
|
||||||
golang.org/x/net v0.8.0 // indirect
|
|
||||||
golang.org/x/sync v0.1.0 // indirect
|
golang.org/x/sync v0.1.0 // indirect
|
||||||
golang.org/x/sys v0.6.0 // indirect
|
golang.org/x/sys v0.6.0 // indirect
|
||||||
golang.org/x/text v0.8.0 // indirect
|
golang.org/x/text v0.8.0 // indirect
|
||||||
|
|||||||
@@ -20,19 +20,17 @@ type Pages interface {
|
|||||||
GetFile(ctx context.Context, pageID, fileID uuid.UUID) (*entity.File, error)
|
GetFile(ctx context.Context, pageID, fileID uuid.UUID) (*entity.File, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewService(pages Pages, ch chan *entity.Page, processor entity.Processor) *Service {
|
func NewService(pages Pages, ch chan *entity.Page) *Service {
|
||||||
return &Service{
|
return &Service{
|
||||||
pages: pages,
|
pages: pages,
|
||||||
ch: ch,
|
ch: ch,
|
||||||
processor: processor,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
type Service struct {
|
type Service struct {
|
||||||
openapi.UnimplementedHandler
|
openapi.UnimplementedHandler
|
||||||
pages Pages
|
pages Pages
|
||||||
ch chan *entity.Page
|
ch chan *entity.Page
|
||||||
processor entity.Processor
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Service) GetPage(ctx context.Context, params openapi.GetPageParams) (openapi.GetPageRes, error) {
|
func (s *Service) GetPage(ctx context.Context, params openapi.GetPageParams) (openapi.GetPageRes, error) {
|
||||||
@@ -83,13 +81,6 @@ func (s *Service) AddPage(ctx context.Context, req openapi.OptAddPageReq, params
|
|||||||
page := entity.NewPage(url, description, domainFormats...)
|
page := entity.NewPage(url, description, domainFormats...)
|
||||||
page.Status = entity.StatusProcessing
|
page.Status = entity.StatusProcessing
|
||||||
|
|
||||||
meta, err := s.processor.GetMeta(ctx, page.URL)
|
|
||||||
if err != nil {
|
|
||||||
page.Meta.Error = err.Error()
|
|
||||||
} else {
|
|
||||||
page.Meta = meta
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := s.pages.Save(ctx, page); err != nil {
|
if err := s.pages.Save(ctx, page); err != nil {
|
||||||
return nil, fmt.Errorf("save page: %w", err)
|
return nil, fmt.Errorf("save page: %w", err)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user