10 Commits

Author SHA1 Message Date
f47dbefb67 web ui: index and basic details page, api refactoring 2023-04-04 23:02:02 +03:00
2a8b94136f web ui: basic logic 2023-04-04 16:24:35 +03:00
790eece361 Add roadmap item 2023-04-04 08:47:15 +03:00
f517a0e3a6 Update LICENSE.txt 2023-04-03 21:50:59 +03:00
dbb6d6f968 Improve docker-compose.yaml 2023-04-03 20:36:50 +03:00
a4f9022f40 Use prebuilt image in docker-compose.yaml 2023-04-03 19:12:43 +03:00
0a6b247765 Fix github actions 2023-04-03 18:40:34 +03:00
b7533d407f Hide pdf processor test with tag 2023-04-03 16:56:11 +03:00
7d4056e312 Update github actions 2023-04-03 16:54:30 +03:00
695021dae6 Add github actions 2023-04-03 16:50:32 +03:00
25 changed files with 1014 additions and 46 deletions

47
.github/workflows/release.yaml vendored Normal file
View File

@@ -0,0 +1,47 @@
# Release workflow: runs when a tag matching 'v*' is pushed, builds the Docker
# image from ./Dockerfile for linux/amd64 and linux/arm64 and pushes it to
# ghcr.io/derfenix/webarchive.
name: release
on:
push:
tags:
- 'v*'
jobs:
release:
runs-on: ubuntu-latest
steps:
- name: Setup Go
uses: actions/setup-go@v3
with:
go-version: 1.20.x
- name: Checkout code
uses: actions/checkout@v3
- name: Setup Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v2
# Logs in to ghcr.io as the actor that triggered the run, using the workflow token.
- name: Login to GitHub Container Registry
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ github.token }}
# Only the `labels` output of this step is consumed below; tags are listed explicitly.
- name: Docker meta
id: meta
uses: docker/metadata-action@v4
with:
images: ghcr.io/derfenix/webarchive
# Pushes the image under two tags: `latest` and the name of the pushed git ref.
- name: Build and push
uses: docker/build-push-action@v4
with:
push: true
file: ./Dockerfile
platforms: linux/amd64,linux/arm64
tags: |
ghcr.io/derfenix/webarchive:latest
ghcr.io/derfenix/webarchive:${{github.ref_name}}
labels: ${{ steps.meta.outputs.labels }}

58
.github/workflows/test.yaml vendored Normal file
View File

@@ -0,0 +1,58 @@
# Test workflow: runs on every pull request and on pushes to master; executes
# `go test ./...` followed by golangci-lint.
name: test
on:
pull_request:
push:
branches:
- master
jobs:
test:
runs-on: ubuntu-latest
steps:
- name: Setup Go
uses: actions/setup-go@v3
with:
go-version: 1.20.x
- name: Checkout code
uses: actions/checkout@v3
# Caches the Go build cache and module cache, keyed by runner OS and the hash of all go.sum files.
- name: go mod package cache
uses: actions/cache@v3
with:
path: |
~/.cache/go-build
~/go/pkg/mod
key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }}
restore-keys: |
${{ runner.os }}-go-
- name: Tests
run: go test ./...
- name: golangci-lint
uses: golangci/golangci-lint-action@v3
with:
# Optional: version of golangci-lint to use, in the form v1.2, v1.2.3, or `latest` for the latest version
version: latest
# Optional: working directory, useful for monorepos
# working-directory: somedir
# Optional: golangci-lint command line arguments.
# args: --issues-exit-code=0
# Optional: show only new issues if it's a pull request. The default value is `false`.
# only-new-issues: true
# Optional: if set to true, all caching functionality is completely disabled;
# takes precedence over all other caching options.
# skip-cache: true
# Optional: if set to true, the action doesn't cache or restore ~/go/pkg.
# skip-pkg-cache: true
# Optional: if set to true, the action doesn't cache or restore ~/.cache/go-build.
# skip-build-cache: true

14
.idea/webResources.xml generated Normal file
View File

@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="WebResourcesPaths">
<contentEntries>
<entry url="file://$PROJECT_DIR$">
<entryData>
<resourceRoots>
<path value="file://$PROJECT_DIR$/ui" />
</resourceRoots>
</entryData>
</entry>
</contentEntries>
</component>
</project>

View File

@@ -2,14 +2,10 @@ Copyright (c) 2023, derfenix <derfenix@gmail.com> All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1) Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3) All advertising materials mentioning features or use of this software must display the following acknowledgement: 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
"This product includes software developed by the University of California, Berkeley and its contributors." THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4) Neither the name of the <ORGANIZATION> nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -24,13 +24,17 @@ variables:
* **LOGGING_DEBUG** — enable debug logs (default `false`) * **LOGGING_DEBUG** — enable debug logs (default `false`)
* **API** * **API**
* **API_ADDRESS** — address the API server will listen (default `0.0.0.0:5001`) * **API_ADDRESS** — address the API server will listen (default `0.0.0.0:5001`)
* **UI**
* **UI_ENABLED** — enable the built-in web UI (default `true`)
* **UI_PREFIX** — Prefix for the web UI (default `/`)
* **UI_THEME** — UI theme name (default `basic`). No other values available yet
* **PDF** * **PDF**
* **PDF_LANDSCAPE** — use landscape page orientation instead of portrait (default `false`) * **PDF_LANDSCAPE** — use landscape page orientation instead of portrait (default `false`)
* **PDF_GRAYSCALE** — use grayscale filter for the output pdf (default `false`) * **PDF_GRAYSCALE** — use grayscale filter for the output pdf (default `false`)
* **PDF_MEDIA_PRINT** — use media type `print` for the request (default `true`) * **PDF_MEDIA_PRINT** — use media type `print` for the request (default `true`)
* **PDF_ZOOM** — zoom page (default `1.0` i.e. no actual zoom) * **PDF_ZOOM** — zoom page (default `1.0` i.e. no actual zoom)
* **PDF_VIEWPORT** — use specified viewport value (default `1920x1080`) * **PDF_VIEWPORT** — use specified viewport value (default `1280x720`)
* **PDF_DPI** — use specified DPI value for the output pdf (default `300`) * **PDF_DPI** — use specified DPI value for the output pdf (default `150`)
* **PDF_FILENAME** — use specified name for output pdf file (default `page.pdf`) * **PDF_FILENAME** — use specified name for output pdf file (default `page.pdf`)
@@ -60,7 +64,7 @@ docker compose up -d webarchive
### 2. Add a page ### 2. Add a page
```shell ```shell
curl -X POST --location "http://localhost:5001/pages" \ curl -X POST --location "http://localhost:5001/api/v1/pages" \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d "{ -d "{
\"url\": \"https://github.com/wkhtmltopdf/wkhtmltopdf/issues/1937\", \"url\": \"https://github.com/wkhtmltopdf/wkhtmltopdf/issues/1937\",
@@ -75,13 +79,13 @@ or
```shell ```shell
curl -X POST --location \ curl -X POST --location \
"http://localhost:5001/pages?url=https%3A%2F%2Fgithub.com%2Fwkhtmltopdf%2Fwkhtmltopdf%2Fissues%2F1937&formats=pdf%2Cheaders&description=Foo+Bar" "http://localhost:5001/api/v1/pages?url=https%3A%2F%2Fgithub.com%2Fwkhtmltopdf%2Fwkhtmltopdf%2Fissues%2F1937&formats=pdf%2Cheaders&description=Foo+Bar"
``` ```
### 3. Get the page's info ### 3. Get the page's info
```shell ```shell
curl -X GET --location "http://localhost:5001/pages/$page_id" | jq . curl -X GET --location "http://localhost:5001/api/v1/pages/$page_id" | jq .
``` ```
where `$page_id` — value of the `id` field from previous command response. where `$page_id` — value of the `id` field from previous command response.
If `status` field in response is `success` (or `with_errors`) - the `results` field If `status` field in response is `success` (or `with_errors`) - the `results` field
@@ -90,7 +94,7 @@ will contain all processed formats with ids of the stored files.
### 4. Open file in browser ### 4. Open file in browser
```shell ```shell
xdg-open "http://localhost:5001/pages/$page_id/file/$file_id" xdg-open "http://localhost:5001/api/v1/pages/$page_id/file/$file_id"
``` ```
Where `$page_id` — value of the `id` field from previous command response, and Where `$page_id` — value of the `id` field from previous command response, and
`$file_id` — the id of interesting file. `$file_id` — the id of interesting file.
@@ -98,7 +102,7 @@ Where `$page_id` — value of the `id` field from previous command response, an
### 5. List all stored pages ### 5. List all stored pages
```shell ```shell
curl -X GET --location "http://localhost:5001/pages" | jq . curl -X GET --location "http://localhost:5001/api/v1/pages" | jq .
``` ```
## Roadmap ## Roadmap
@@ -111,3 +115,4 @@ curl -X GET --location "http://localhost:5001/pages" | jq .
- [ ] Optional authentication - [ ] Optional authentication
- [ ] Multi-user access - [ ] Multi-user access
- [ ] Support SQL database with or without separate files storage - [ ] Support SQL database with or without separate files storage
- [ ] Tags/Categories

View File

@@ -47,6 +47,7 @@ func (p *PDF) Process(_ context.Context, url string) ([]entity.File, error) {
page.FooterFontSize.Set(10) page.FooterFontSize.Set(10)
page.Zoom.Set(p.cfg.Zoom) page.Zoom.Set(p.cfg.Zoom)
page.ViewportSize.Set(p.cfg.Viewport) page.ViewportSize.Set(p.cfg.Viewport)
page.NoBackground.Set(true)
gen.AddPage(page) gen.AddPage(page)

View File

@@ -1,3 +1,5 @@
//go:build integration
package processors package processors
import ( import (

View File

@@ -8,6 +8,8 @@ import (
"net/http/cookiejar" "net/http/cookiejar"
"time" "time"
"golang.org/x/net/html"
"github.com/derfenix/webarchive/config" "github.com/derfenix/webarchive/config"
"github.com/derfenix/webarchive/entity" "github.com/derfenix/webarchive/entity"
) )
@@ -52,6 +54,7 @@ func NewProcessors(cfg config.Config) (*Processors, error) {
} }
procs := Processors{ procs := Processors{
client: httpClient,
processors: map[entity.Format]processor{ processors: map[entity.Format]processor{
entity.FormatHeaders: NewHeaders(httpClient), entity.FormatHeaders: NewHeaders(httpClient),
entity.FormatPDF: NewPDF(cfg.PDF), entity.FormatPDF: NewPDF(cfg.PDF),
@@ -64,6 +67,7 @@ func NewProcessors(cfg config.Config) (*Processors, error) {
type Processors struct { type Processors struct {
processors map[entity.Format]processor processors map[entity.Format]processor
client *http.Client
} }
func (p *Processors) Process(ctx context.Context, format entity.Format, url string) entity.Result { func (p *Processors) Process(ctx context.Context, format entity.Format, url string) entity.Result {
@@ -93,3 +97,62 @@ func (p *Processors) OverrideProcessor(format entity.Format, proc processor) err
return nil return nil
} }
// GetMeta fetches url with the processors' shared HTTP client and extracts the
// page metadata (title and description) from the returned HTML document.
//
// It returns an error when the request cannot be built or sent, when the
// response status is not 200 OK, when the response has no body, or when the
// body cannot be parsed as HTML.
func (p *Processors) GetMeta(ctx context.Context, url string) (entity.Meta, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return entity.Meta{}, fmt.Errorf("new request: %w", err)
	}

	response, err := p.client.Do(req)
	if err != nil {
		return entity.Meta{}, fmt.Errorf("do request: %w", err)
	}

	// Register the close before any early return: a non-200 response still
	// carries a body that must be closed, otherwise the connection leaks.
	if response.Body != nil {
		defer func() {
			_ = response.Body.Close()
		}()
	}

	if response.StatusCode != http.StatusOK {
		return entity.Meta{}, fmt.Errorf("want status 200, got %d", response.StatusCode)
	}

	if response.Body == nil {
		return entity.Meta{}, fmt.Errorf("empty response body")
	}

	htmlNode, err := html.Parse(response.Body)
	if err != nil {
		return entity.Meta{}, fmt.Errorf("parse response body: %w", err)
	}

	meta := entity.Meta{}
	getMetaData(htmlNode, &meta)

	return meta, nil
}
// getMetaData walks the HTML tree below n depth-first and fills meta from it:
// Title is taken from the text of <title> elements and Description from the
// content attribute of <meta name="description"> elements. When several
// matching elements exist, the last one visited wins.
//
// A nil n is a no-op.
func getMetaData(n *html.Node, meta *entity.Meta) {
	if n == nil {
		return
	}

	for c := n.FirstChild; c != nil; c = c.NextSibling {
		if c.Type == html.ElementNode {
			switch c.Data {
			case "title":
				// An empty <title></title> has no child node; dereferencing
				// FirstChild unconditionally would panic. Only a text child
				// carries the actual title string.
				if text := c.FirstChild; text != nil && text.Type == html.TextNode {
					meta.Title = text.Data
				}

			case "meta":
				var name, content string
				for _, attr := range c.Attr {
					switch attr.Key {
					case "name":
						name = attr.Val
					case "content":
						content = attr.Val
					}
				}

				if name == "description" {
					meta.Description = content
				}
			}
		}

		getMetaData(c, meta)
	}
}

View File

@@ -64,18 +64,18 @@ func (p *Page) GetFile(_ context.Context, pageID, fileID uuid.UUID) (*entity.Fil
return file, nil return file, nil
} }
func (p *Page) Save(_ context.Context, site *entity.Page) error { func (p *Page) Save(_ context.Context, page *entity.Page) error {
if p.db.IsClosed() { if p.db.IsClosed() {
return ErrDBClosed return ErrDBClosed
} }
marshaled, err := marshal(site) marshaled, err := marshal(page)
if err != nil { if err != nil {
return fmt.Errorf("marshal data: %w", err) return fmt.Errorf("marshal data: %w", err)
} }
if err := p.db.Update(func(txn *badger.Txn) error { if err := p.db.Update(func(txn *badger.Txn) error {
if err := txn.Set(p.key(site), marshaled); err != nil { if err := txn.Set(p.key(page), marshaled); err != nil {
return fmt.Errorf("put data: %w", err) return fmt.Errorf("put data: %w", err)
} }
@@ -151,6 +151,64 @@ func (p *Page) ListAll(ctx context.Context) ([]*entity.Page, error) {
Formats: page.Formats, Formats: page.Formats,
Version: page.Version, Version: page.Version,
Status: page.Status, Status: page.Status,
Meta: page.Meta,
})
}
return nil
})
if err != nil {
return nil, fmt.Errorf("view: %w", err)
}
sort.Slice(pages, func(i, j int) bool {
return pages[i].Created.After(pages[j].Created)
})
return pages, nil
}
func (p *Page) ListUnprocessed(ctx context.Context) ([]*entity.Page, error) {
pages := make([]*entity.Page, 0, 100)
err := p.db.View(func(txn *badger.Txn) error {
iterator := txn.NewIterator(badger.DefaultIteratorOptions)
defer iterator.Close()
for iterator.Seek(p.prefix); iterator.ValidForPrefix(p.prefix); iterator.Next() {
if err := ctx.Err(); err != nil {
return fmt.Errorf("context canceled: %w", err)
}
var page entity.Page
err := iterator.Item().Value(func(val []byte) error {
if err := unmarshal(val, &page); err != nil {
return fmt.Errorf("unmarshal: %w", err)
}
return nil
})
if err != nil {
return fmt.Errorf("get item: %w", err)
}
if page.Status != entity.StatusProcessing {
continue
}
pages = append(pages, &entity.Page{
ID: page.ID,
URL: page.URL,
Description: page.Description,
Created: page.Created,
Formats: page.Formats,
Version: page.Version,
Status: page.Status,
Meta: page.Meta,
}) })
} }

View File

@@ -4,7 +4,7 @@ info:
description: API description in Markdown. description: API description in Markdown.
version: 1.0.0 version: 1.0.0
servers: servers:
- url: 'https://api.example.com' - url: 'https://api.example.com/api/v1'
paths: paths:
/pages: /pages:
get: get:
@@ -183,12 +183,25 @@ components:
$ref: '#/components/schemas/format' $ref: '#/components/schemas/format'
status: status:
$ref: '#/components/schemas/status' $ref: '#/components/schemas/status'
meta:
type: object
properties:
title:
type: string
description:
type: string
error:
type: string
required:
- title
- description
required: required:
- id - id
- url - url
- formats - formats
- status - status
- created - created
- meta
result: result:
type: object type: object
properties: properties:

View File

@@ -534,14 +534,20 @@ func (s *Page) encodeFields(e *jx.Encoder) {
e.FieldStart("status") e.FieldStart("status")
s.Status.Encode(e) s.Status.Encode(e)
} }
{
e.FieldStart("meta")
s.Meta.Encode(e)
}
} }
var jsonFieldsNameOfPage = [5]string{ var jsonFieldsNameOfPage = [6]string{
0: "id", 0: "id",
1: "url", 1: "url",
2: "created", 2: "created",
3: "formats", 3: "formats",
4: "status", 4: "status",
5: "meta",
} }
// Decode decodes Page from json. // Decode decodes Page from json.
@@ -617,6 +623,16 @@ func (s *Page) Decode(d *jx.Decoder) error {
}(); err != nil { }(); err != nil {
return errors.Wrap(err, "decode field \"status\"") return errors.Wrap(err, "decode field \"status\"")
} }
case "meta":
requiredBitSet[0] |= 1 << 5
if err := func() error {
if err := s.Meta.Decode(d); err != nil {
return err
}
return nil
}(); err != nil {
return errors.Wrap(err, "decode field \"meta\"")
}
default: default:
return d.Skip() return d.Skip()
} }
@@ -627,7 +643,7 @@ func (s *Page) Decode(d *jx.Decoder) error {
// Validate required fields. // Validate required fields.
var failures []validate.FieldError var failures []validate.FieldError
for i, mask := range [1]uint8{ for i, mask := range [1]uint8{
0b00011111, 0b00111111,
} { } {
if result := (requiredBitSet[i] & mask) ^ mask; result != 0 { if result := (requiredBitSet[i] & mask) ^ mask; result != 0 {
// Mask only required fields and check equality to mask using XOR. // Mask only required fields and check equality to mask using XOR.
@@ -673,6 +689,138 @@ func (s *Page) UnmarshalJSON(data []byte) error {
return s.Decode(d) return s.Decode(d)
} }
// Encode writes s to e as a single JSON object: it opens the object, emits the
// fields via encodeFields and closes the object.
func (s *PageMeta) Encode(e *jx.Encoder) {
e.ObjStart()
s.encodeFields(e)
e.ObjEnd()
}
// encodeFields writes the members of s into the JSON object currently open on
// e: "title" and "description" are always written, "error" only when it is set.
func (s *PageMeta) encodeFields(e *jx.Encoder) {
	e.FieldStart("title")
	e.Str(s.Title)

	e.FieldStart("description")
	e.Str(s.Description)

	if !s.Error.Set {
		return
	}

	e.FieldStart("error")
	s.Error.Encode(e)
}
// jsonFieldsNameOfPageMeta maps a field index to its JSON key. Decode uses the
// same indices as bit positions when reporting missing required fields.
var jsonFieldsNameOfPageMeta = [3]string{
0: "title",
1: "description",
2: "error",
}
// Decode decodes PageMeta from json.
//
// "title" and "description" are required: if either key is absent a
// *validate.Error listing the missing fields is returned. "error" is optional.
// Unknown keys are skipped. Decoding into a nil receiver is an error.
func (s *PageMeta) Decode(d *jx.Decoder) error {
if s == nil {
return errors.New("invalid: unable to decode PageMeta to nil")
}
// One bit per field, indexed as in jsonFieldsNameOfPageMeta; a set bit means the key was seen.
var requiredBitSet [1]uint8
if err := d.ObjBytes(func(d *jx.Decoder, k []byte) error {
switch string(k) {
case "title":
requiredBitSet[0] |= 1 << 0
if err := func() error {
v, err := d.Str()
// The field is assigned before the error check, so it is overwritten even on failure.
s.Title = string(v)
if err != nil {
return err
}
return nil
}(); err != nil {
return errors.Wrap(err, "decode field \"title\"")
}
case "description":
requiredBitSet[0] |= 1 << 1
if err := func() error {
v, err := d.Str()
s.Description = string(v)
if err != nil {
return err
}
return nil
}(); err != nil {
return errors.Wrap(err, "decode field \"description\"")
}
case "error":
// Optional field: no bit is recorded for it.
if err := func() error {
s.Error.Reset()
if err := s.Error.Decode(d); err != nil {
return err
}
return nil
}(); err != nil {
return errors.Wrap(err, "decode field \"error\"")
}
default:
return d.Skip()
}
return nil
}); err != nil {
return errors.Wrap(err, "decode PageMeta")
}
// Validate required fields.
var failures []validate.FieldError
for i, mask := range [1]uint8{
// Bits 0 and 1: "title" and "description".
0b00000011,
} {
if result := (requiredBitSet[i] & mask) ^ mask; result != 0 {
// Mask only required fields and check equality to mask using XOR.
//
// If XOR result is not zero, result is not equal to expected, so some fields are missed.
// Bits of fields which would be set are actually bits of missed fields.
missed := bits.OnesCount8(result)
for bitN := 0; bitN < missed; bitN++ {
bitIdx := bits.TrailingZeros8(result)
fieldIdx := i*8 + bitIdx
var name string
if fieldIdx < len(jsonFieldsNameOfPageMeta) {
name = jsonFieldsNameOfPageMeta[fieldIdx]
} else {
// No known name for this index: fall back to the numeric index.
name = strconv.Itoa(fieldIdx)
}
failures = append(failures, validate.FieldError{
Name: name,
Error: validate.ErrFieldRequired,
})
// Reset bit.
result &^= 1 << bitIdx
}
}
}
if len(failures) > 0 {
return &validate.Error{Fields: failures}
}
return nil
}
// MarshalJSON implements stdjson.Marshaler by encoding s with a fresh
// jx.Encoder and returning its bytes. The returned error is always nil.
func (s *PageMeta) MarshalJSON() ([]byte, error) {
	var enc jx.Encoder
	s.Encode(&enc)

	return enc.Bytes(), nil
}
// UnmarshalJSON implements stdjson.Unmarshaler by running Decode over data.
func (s *PageMeta) UnmarshalJSON(data []byte) error {
	return s.Decode(jx.DecodeBytes(data))
}
// Encode implements json.Marshaler. // Encode implements json.Marshaler.
func (s *PageWithResults) Encode(e *jx.Encoder) { func (s *PageWithResults) Encode(e *jx.Encoder) {
e.ObjStart() e.ObjStart()
@@ -711,6 +859,11 @@ func (s *PageWithResults) encodeFields(e *jx.Encoder) {
e.FieldStart("status") e.FieldStart("status")
s.Status.Encode(e) s.Status.Encode(e)
} }
{
e.FieldStart("meta")
s.Meta.Encode(e)
}
{ {
e.FieldStart("results") e.FieldStart("results")
@@ -722,13 +875,14 @@ func (s *PageWithResults) encodeFields(e *jx.Encoder) {
} }
} }
var jsonFieldsNameOfPageWithResults = [6]string{ var jsonFieldsNameOfPageWithResults = [7]string{
0: "id", 0: "id",
1: "url", 1: "url",
2: "created", 2: "created",
3: "formats", 3: "formats",
4: "status", 4: "status",
5: "results", 5: "meta",
6: "results",
} }
// Decode decodes PageWithResults from json. // Decode decodes PageWithResults from json.
@@ -804,8 +958,18 @@ func (s *PageWithResults) Decode(d *jx.Decoder) error {
}(); err != nil { }(); err != nil {
return errors.Wrap(err, "decode field \"status\"") return errors.Wrap(err, "decode field \"status\"")
} }
case "results": case "meta":
requiredBitSet[0] |= 1 << 5 requiredBitSet[0] |= 1 << 5
if err := func() error {
if err := s.Meta.Decode(d); err != nil {
return err
}
return nil
}(); err != nil {
return errors.Wrap(err, "decode field \"meta\"")
}
case "results":
requiredBitSet[0] |= 1 << 6
if err := func() error { if err := func() error {
s.Results = make([]Result, 0) s.Results = make([]Result, 0)
if err := d.Arr(func(d *jx.Decoder) error { if err := d.Arr(func(d *jx.Decoder) error {
@@ -832,7 +996,7 @@ func (s *PageWithResults) Decode(d *jx.Decoder) error {
// Validate required fields. // Validate required fields.
var failures []validate.FieldError var failures []validate.FieldError
for i, mask := range [1]uint8{ for i, mask := range [1]uint8{
0b00111111, 0b01111111,
} { } {
if result := (requiredBitSet[i] & mask) ^ mask; result != 0 { if result := (requiredBitSet[i] & mask) ^ mask; result != 0 {
// Mask only required fields and check equality to mask using XOR. // Mask only required fields and check equality to mask using XOR.
@@ -878,6 +1042,138 @@ func (s *PageWithResults) UnmarshalJSON(data []byte) error {
return s.Decode(d) return s.Decode(d)
} }
// Encode writes s to e as a single JSON object: it opens the object, emits the
// fields via encodeFields and closes the object.
func (s *PageWithResultsMeta) Encode(e *jx.Encoder) {
e.ObjStart()
s.encodeFields(e)
e.ObjEnd()
}
// encodeFields writes the members of s into the JSON object currently open on
// e: "title" and "description" are always written, "error" only when it is set.
func (s *PageWithResultsMeta) encodeFields(e *jx.Encoder) {
	e.FieldStart("title")
	e.Str(s.Title)

	e.FieldStart("description")
	e.Str(s.Description)

	if !s.Error.Set {
		return
	}

	e.FieldStart("error")
	s.Error.Encode(e)
}
// jsonFieldsNameOfPageWithResultsMeta maps a field index to its JSON key.
// Decode uses the same indices as bit positions when reporting missing
// required fields.
var jsonFieldsNameOfPageWithResultsMeta = [3]string{
0: "title",
1: "description",
2: "error",
}
// Decode decodes PageWithResultsMeta from json.
//
// "title" and "description" are required: if either key is absent a
// *validate.Error listing the missing fields is returned. "error" is optional.
// Unknown keys are skipped. Decoding into a nil receiver is an error.
func (s *PageWithResultsMeta) Decode(d *jx.Decoder) error {
if s == nil {
return errors.New("invalid: unable to decode PageWithResultsMeta to nil")
}
// One bit per field, indexed as in jsonFieldsNameOfPageWithResultsMeta; a set bit means the key was seen.
var requiredBitSet [1]uint8
if err := d.ObjBytes(func(d *jx.Decoder, k []byte) error {
switch string(k) {
case "title":
requiredBitSet[0] |= 1 << 0
if err := func() error {
v, err := d.Str()
// The field is assigned before the error check, so it is overwritten even on failure.
s.Title = string(v)
if err != nil {
return err
}
return nil
}(); err != nil {
return errors.Wrap(err, "decode field \"title\"")
}
case "description":
requiredBitSet[0] |= 1 << 1
if err := func() error {
v, err := d.Str()
s.Description = string(v)
if err != nil {
return err
}
return nil
}(); err != nil {
return errors.Wrap(err, "decode field \"description\"")
}
case "error":
// Optional field: no bit is recorded for it.
if err := func() error {
s.Error.Reset()
if err := s.Error.Decode(d); err != nil {
return err
}
return nil
}(); err != nil {
return errors.Wrap(err, "decode field \"error\"")
}
default:
return d.Skip()
}
return nil
}); err != nil {
return errors.Wrap(err, "decode PageWithResultsMeta")
}
// Validate required fields.
var failures []validate.FieldError
for i, mask := range [1]uint8{
// Bits 0 and 1: "title" and "description".
0b00000011,
} {
if result := (requiredBitSet[i] & mask) ^ mask; result != 0 {
// Mask only required fields and check equality to mask using XOR.
//
// If XOR result is not zero, result is not equal to expected, so some fields are missed.
// Bits of fields which would be set are actually bits of missed fields.
missed := bits.OnesCount8(result)
for bitN := 0; bitN < missed; bitN++ {
bitIdx := bits.TrailingZeros8(result)
fieldIdx := i*8 + bitIdx
var name string
if fieldIdx < len(jsonFieldsNameOfPageWithResultsMeta) {
name = jsonFieldsNameOfPageWithResultsMeta[fieldIdx]
} else {
// No known name for this index: fall back to the numeric index.
name = strconv.Itoa(fieldIdx)
}
failures = append(failures, validate.FieldError{
Name: name,
Error: validate.ErrFieldRequired,
})
// Reset bit.
result &^= 1 << bitIdx
}
}
}
if len(failures) > 0 {
return &validate.Error{Fields: failures}
}
return nil
}
// MarshalJSON implements stdjson.Marshaler by encoding s with a fresh
// jx.Encoder and returning its bytes. The returned error is always nil.
func (s *PageWithResultsMeta) MarshalJSON() ([]byte, error) {
	var enc jx.Encoder
	s.Encode(&enc)

	return enc.Bytes(), nil
}
// UnmarshalJSON implements stdjson.Unmarshaler by running Decode over data.
func (s *PageWithResultsMeta) UnmarshalJSON(data []byte) error {
	return s.Decode(jx.DecodeBytes(data))
}
// Encode encodes Pages as json. // Encode encodes Pages as json.
func (s Pages) Encode(e *jx.Encoder) { func (s Pages) Encode(e *jx.Encoder) {
unwrapped := []Page(s) unwrapped := []Page(s)

View File

@@ -324,6 +324,7 @@ type Page struct {
Created time.Time `json:"created"` Created time.Time `json:"created"`
Formats []Format `json:"formats"` Formats []Format `json:"formats"`
Status Status `json:"status"` Status Status `json:"status"`
Meta PageMeta `json:"meta"`
} }
// GetID returns the value of ID. // GetID returns the value of ID.
@@ -351,6 +352,11 @@ func (s *Page) GetStatus() Status {
return s.Status return s.Status
} }
// GetMeta returns the value of Meta. The struct is returned by value, so
// changes to the result do not affect s.
func (s *Page) GetMeta() PageMeta {
return s.Meta
}
// SetID sets the value of ID. // SetID sets the value of ID.
func (s *Page) SetID(val uuid.UUID) { func (s *Page) SetID(val uuid.UUID) {
s.ID = val s.ID = val
@@ -376,17 +382,59 @@ func (s *Page) SetStatus(val Status) {
s.Status = val s.Status = val
} }
// SetMeta sets the value of Meta, replacing the whole struct with val.
func (s *Page) SetMeta(val PageMeta) {
s.Meta = val
}
func (*Page) addPageRes() {} func (*Page) addPageRes() {}
// PageMeta is the "meta" object of a Page: the page title and description plus
// an optional error string.
type PageMeta struct {
Title string `json:"title"`
Description string `json:"description"`
// Error is optional; it is present in JSON only when set.
Error OptString `json:"error"`
}
// GetTitle returns the value of Title.
func (s *PageMeta) GetTitle() string {
return s.Title
}
// GetDescription returns the value of Description.
func (s *PageMeta) GetDescription() string {
return s.Description
}
// GetError returns the value of Error.
func (s *PageMeta) GetError() OptString {
return s.Error
}
// SetTitle sets the value of Title.
func (s *PageMeta) SetTitle(val string) {
s.Title = val
}
// SetDescription sets the value of Description.
func (s *PageMeta) SetDescription(val string) {
s.Description = val
}
// SetError sets the value of Error.
func (s *PageMeta) SetError(val OptString) {
s.Error = val
}
// Merged schema. // Merged schema.
// Ref: #/components/schemas/pageWithResults // Ref: #/components/schemas/pageWithResults
type PageWithResults struct { type PageWithResults struct {
ID uuid.UUID `json:"id"` ID uuid.UUID `json:"id"`
URL string `json:"url"` URL string `json:"url"`
Created time.Time `json:"created"` Created time.Time `json:"created"`
Formats []Format `json:"formats"` Formats []Format `json:"formats"`
Status Status `json:"status"` Status Status `json:"status"`
Results []Result `json:"results"` Meta PageWithResultsMeta `json:"meta"`
Results []Result `json:"results"`
} }
// GetID returns the value of ID. // GetID returns the value of ID.
@@ -414,6 +462,11 @@ func (s *PageWithResults) GetStatus() Status {
return s.Status return s.Status
} }
// GetMeta returns the value of Meta. The struct is returned by value, so
// changes to the result do not affect s.
func (s *PageWithResults) GetMeta() PageWithResultsMeta {
return s.Meta
}
// GetResults returns the value of Results. // GetResults returns the value of Results.
func (s *PageWithResults) GetResults() []Result { func (s *PageWithResults) GetResults() []Result {
return s.Results return s.Results
@@ -444,6 +497,11 @@ func (s *PageWithResults) SetStatus(val Status) {
s.Status = val s.Status = val
} }
// SetMeta sets the value of Meta, replacing the whole struct with val.
func (s *PageWithResults) SetMeta(val PageWithResultsMeta) {
s.Meta = val
}
// SetResults sets the value of Results. // SetResults sets the value of Results.
func (s *PageWithResults) SetResults(val []Result) { func (s *PageWithResults) SetResults(val []Result) {
s.Results = val s.Results = val
@@ -451,6 +509,42 @@ func (s *PageWithResults) SetResults(val []Result) {
func (*PageWithResults) getPageRes() {} func (*PageWithResults) getPageRes() {}
// PageWithResultsMeta is the "meta" object of a PageWithResults: the page title
// and description plus an optional error string.
type PageWithResultsMeta struct {
Title string `json:"title"`
Description string `json:"description"`
// Error is optional; it is present in JSON only when set.
Error OptString `json:"error"`
}
// GetTitle returns the value of Title.
func (s *PageWithResultsMeta) GetTitle() string {
return s.Title
}
// GetDescription returns the value of Description.
func (s *PageWithResultsMeta) GetDescription() string {
return s.Description
}
// GetError returns the value of Error.
func (s *PageWithResultsMeta) GetError() OptString {
return s.Error
}
// SetTitle sets the value of Title.
func (s *PageWithResultsMeta) SetTitle(val string) {
s.Title = val
}
// SetDescription sets the value of Description.
func (s *PageWithResultsMeta) SetDescription(val string) {
s.Description = val
}
// SetError sets the value of Error.
func (s *PageWithResultsMeta) SetError(val OptString) {
s.Error = val
}
type Pages []Page type Pages []Page
// Ref: #/components/schemas/result // Ref: #/components/schemas/result

View File

@@ -6,6 +6,7 @@ import (
"fmt" "fmt"
"net" "net"
"net/http" "net/http"
"strings"
"sync" "sync"
"time" "time"
@@ -48,7 +49,8 @@ func NewApplication(cfg config.Config) (Application, error) {
worker := entity.NewWorker(workerCh, pageRepo, processor, log.Named("worker")) worker := entity.NewWorker(workerCh, pageRepo, processor, log.Named("worker"))
server, err := openapi.NewServer( server, err := openapi.NewServer(
rest.NewService(pageRepo, workerCh), rest.NewService(pageRepo, workerCh, processor),
openapi.WithPathPrefix("/api/v1"),
openapi.WithMiddleware( openapi.WithMiddleware(
func(r middleware.Request, next middleware.Next) (middleware.Response, error) { func(r middleware.Request, next middleware.Next) (middleware.Response, error) {
start := time.Now() start := time.Now()
@@ -73,9 +75,25 @@ func NewApplication(cfg config.Config) (Application, error) {
return Application{}, fmt.Errorf("new rest server: %w", err) return Application{}, fmt.Errorf("new rest server: %w", err)
} }
var httpHandler http.Handler = server
if cfg.UI.Enabled {
ui := rest.NewUI(cfg.UI)
httpHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if strings.HasPrefix(r.URL.Path, "/api/") {
server.ServeHTTP(w, r)
return
}
ui.ServeHTTP(w, r)
})
}
httpServer := http.Server{ httpServer := http.Server{
Addr: cfg.API.Address, Addr: cfg.API.Address,
Handler: server, Handler: httpHandler,
ReadTimeout: time.Second * 15, ReadTimeout: time.Second * 15,
ReadHeaderTimeout: time.Second * 5, ReadHeaderTimeout: time.Second * 5,
IdleTimeout: time.Second * 30, IdleTimeout: time.Second * 30,

View File

@@ -28,6 +28,7 @@ type Config struct {
DB DB `env:",prefix=DB_"` DB DB `env:",prefix=DB_"`
Logging Logging `env:",prefix=LOGGING_"` Logging Logging `env:",prefix=LOGGING_"`
API API `env:",prefix=API_"` API API `env:",prefix=API_"`
UI UI `env:",prefix=UI_"`
PDF PDF `env:",prefix=PDF_"` PDF PDF `env:",prefix=PDF_"`
} }
@@ -36,8 +37,8 @@ type PDF struct {
Grayscale bool `env:"GRAYSCALE,default=false"` Grayscale bool `env:"GRAYSCALE,default=false"`
MediaPrint bool `env:"MEDIA_PRINT,default=true"` MediaPrint bool `env:"MEDIA_PRINT,default=true"`
Zoom float64 `env:"ZOOM,default=1"` Zoom float64 `env:"ZOOM,default=1"`
Viewport string `env:"VIEWPORT,default=1920x1080"` Viewport string `env:"VIEWPORT,default=1280x720"`
DPI uint `env:"DPI,default=300"` DPI uint `env:"DPI,default=150"`
Filename string `env:"FILENAME,default=page.pdf"` Filename string `env:"FILENAME,default=page.pdf"`
} }
@@ -45,6 +46,12 @@ type API struct {
Address string `env:"ADDRESS,default=0.0.0.0:5001"` Address string `env:"ADDRESS,default=0.0.0.0:5001"`
} }
// UI holds the settings of the built-in web interface. Values come from
// environment variables; Config applies the UI_ prefix to the names below.
type UI struct {
// Enabled switches the web UI on; NewApplication only wraps the API handler
// with the UI handler when it is true. Defaults to true.
Enabled bool `env:"ENABLED,default=true"`
// Prefix is the URL path prefix the UI handler strips from incoming request
// paths. Defaults to "/".
Prefix string `env:"PREFIX,default=/"`
// Theme names the directory of embedded UI files to serve. Defaults to "basic".
Theme string `env:"THEME,default=basic"`
}
type DB struct { type DB struct {
Path string `env:"PATH,default=./db"` Path string `env:"PATH,default=./db"`
} }

View File

@@ -2,14 +2,15 @@ version: "3"
services: services:
webarchive: webarchive:
build: image: ghcr.io/derfenix/webarchive:latest
dockerfile: ./Dockerfile # build:
context: . # dockerfile: ./Dockerfile
# context: .
environment: environment:
LOGGING_DEBUG: true LOGGING_DEBUG: "true"
API_ADDRESS: 0.0.0.0:5001 API_ADDRESS: "0.0.0.0:5001"
PDF_DPI: 300 PDF_DPI: "300"
DB_PATH: /db DB_PATH: "/db"
volumes: volumes:
- ./db:/db - ./db:/db
ports: ports:

View File

@@ -11,6 +11,7 @@ import (
type Processor interface { type Processor interface {
Process(ctx context.Context, format Format, url string) Result Process(ctx context.Context, format Format, url string) Result
GetMeta(ctx context.Context, url string) (Meta, error)
} }
type Format uint8 type Format uint8
@@ -37,6 +38,12 @@ const (
StatusWithErrors StatusWithErrors
) )
// Meta carries descriptive information about a page as returned by
// Processor.GetMeta.
type Meta struct {
// Title of the page.
Title string
// Description of the page.
Description string
// Error holds the text of the error hit while fetching the metadata;
// AddPage fills it from GetMeta's error and leaves the other fields empty.
Error string
}
func NewPage(url string, description string, formats ...Format) *Page { func NewPage(url string, description string, formats ...Format) *Page {
return &Page{ return &Page{
ID: uuid.New(), ID: uuid.New(),
@@ -57,6 +64,7 @@ type Page struct {
Results Results Results Results
Version uint16 Version uint16
Status Status Status Status
Meta Meta
} }
func (p *Page) SetProcessing() { func (p *Page) SetProcessing() {

View File

@@ -9,6 +9,7 @@ import (
type Pages interface { type Pages interface {
Save(ctx context.Context, page *Page) error Save(ctx context.Context, page *Page) error
ListUnprocessed(ctx context.Context) ([]*Page, error)
} }
func NewWorker(ch chan *Page, pages Pages, processor Processor, log *zap.Logger) *Worker { func NewWorker(ch chan *Page, pages Pages, processor Processor, log *zap.Logger) *Worker {
@@ -27,6 +28,20 @@ func (w *Worker) Start(ctx context.Context, wg *sync.WaitGroup) {
w.log.Info("starting") w.log.Info("starting")
wg.Add(1)
go func() {
defer wg.Done()
unprocessed, err := w.pages.ListUnprocessed(ctx)
if err != nil {
w.log.Error("failed to get unprocessed pages", zap.Error(err))
} else {
for i := range unprocessed {
w.ch <- unprocessed[i]
}
}
}()
for { for {
select { select {
case <-ctx.Done(): case <-ctx.Done():

View File

@@ -2,6 +2,7 @@ package rest
import ( import (
"fmt" "fmt"
"html"
"github.com/derfenix/webarchive/api/openapi" "github.com/derfenix/webarchive/api/openapi"
"github.com/derfenix/webarchive/entity" "github.com/derfenix/webarchive/entity"
@@ -22,6 +23,11 @@ func PageToRestWithResults(page *entity.Page) openapi.PageWithResults {
return res return res
}(), }(),
Status: StatusToRest(page.Status), Status: StatusToRest(page.Status),
Meta: openapi.PageWithResultsMeta{
Title: html.EscapeString(page.Meta.Title),
Description: html.EscapeString(page.Meta.Description),
Error: openapi.NewOptString(page.Meta.Error),
},
Results: func() []openapi.Result { Results: func() []openapi.Result {
results := make([]openapi.Result, len(page.Results.Results())) results := make([]openapi.Result, len(page.Results.Results()))
@@ -65,6 +71,11 @@ func PageToRest(page *entity.Page) openapi.Page {
ID: page.ID, ID: page.ID,
URL: page.URL, URL: page.URL,
Created: page.Created, Created: page.Created,
Meta: openapi.PageMeta{
Title: html.EscapeString(page.Meta.Title),
Description: html.EscapeString(page.Meta.Description),
Error: openapi.NewOptString(page.Meta.Error),
},
Formats: func() []openapi.Format { Formats: func() []openapi.Format {
res := make([]openapi.Format, len(page.Formats)) res := make([]openapi.Format, len(page.Formats))

View File

@@ -20,14 +20,19 @@ type Pages interface {
GetFile(ctx context.Context, pageID, fileID uuid.UUID) (*entity.File, error) GetFile(ctx context.Context, pageID, fileID uuid.UUID) (*entity.File, error)
} }
func NewService(sites Pages, ch chan *entity.Page) *Service { func NewService(pages Pages, ch chan *entity.Page, processor entity.Processor) *Service {
return &Service{pages: sites, ch: ch} return &Service{
pages: pages,
ch: ch,
processor: processor,
}
} }
type Service struct { type Service struct {
openapi.UnimplementedHandler openapi.UnimplementedHandler
pages Pages pages Pages
ch chan *entity.Page ch chan *entity.Page
processor entity.Processor
} }
func (s *Service) GetPage(ctx context.Context, params openapi.GetPageParams) (openapi.GetPageRes, error) { func (s *Service) GetPage(ctx context.Context, params openapi.GetPageParams) (openapi.GetPageRes, error) {
@@ -78,6 +83,13 @@ func (s *Service) AddPage(ctx context.Context, req openapi.OptAddPageReq, params
page := entity.NewPage(url, description, domainFormats...) page := entity.NewPage(url, description, domainFormats...)
page.Status = entity.StatusProcessing page.Status = entity.StatusProcessing
meta, err := s.processor.GetMeta(ctx, page.URL)
if err != nil {
page.Meta.Error = err.Error()
} else {
page.Meta = meta
}
if err := s.pages.Save(ctx, page); err != nil { if err := s.pages.Save(ctx, page); err != nil {
return nil, fmt.Errorf("save page: %w", err) return nil, fmt.Errorf("save page: %w", err)
} }

41
ports/rest/ui.go Normal file
View File

@@ -0,0 +1,41 @@
package rest
import (
"io/fs"
"net/http"
"strings"
"github.com/derfenix/webarchive/config"
"github.com/derfenix/webarchive/ui"
)
// NewUI builds the web UI handler from the UI configuration, copying the
// URL prefix and the theme name.
func NewUI(cfg config.UI) *UI {
	handler := new(UI)
	handler.prefix = cfg.Prefix
	handler.theme = cfg.Theme

	return handler
}
// UI is an http.Handler that serves the embedded web interface files.
type UI struct {
	// theme is the directory inside the embedded files that is served.
	theme string
	// prefix is the URL path prefix removed from request paths before lookup.
	prefix string
}
// ServeHTTP serves the embedded files of the configured theme.
//
// The configured prefix is stripped from the request path. Paths under
// /static are mapped onto the theme directory (with /static removed); every
// other path is answered with the theme root, i.e. its index page, so that
// client-side routes such as /<page id> load the application.
//
// If the theme directory cannot be opened, the response is a bare
// 500 Internal Server Error.
func (u *UI) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	serveRoot, err := fs.Sub(ui.StaticFiles, u.theme)
	if err != nil {
		w.WriteHeader(http.StatusInternalServerError)

		return
	}

	// Re-root the remainder with exactly one leading slash. A prefix without a
	// trailing slash (e.g. "/ui") used to yield "//static/..." here, which
	// failed the /static check below and turned every asset request into the
	// index page.
	reqPath := "/" + strings.TrimLeft(strings.TrimPrefix(r.URL.Path, u.prefix), "/")

	if strings.HasPrefix(reqPath, "/static") {
		reqPath = strings.TrimPrefix(reqPath, "/static")
	} else {
		reqPath = "/"
	}

	r.URL.Path = reqPath

	http.FileServer(http.FS(serveRoot)).ServeHTTP(w, r)
}

47
ui/basic/index.html Normal file
View File

@@ -0,0 +1,47 @@
<!doctype html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, user-scalable=no, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <title>WebArchive</title>
    <link rel="stylesheet" href="/static/style.css">
    <script src="/static/lib.js"></script>
    <script src="/static/main.js"></script>
</head>
<body>

<!-- One entry of the page list; cloned and filled by index() in main.js. -->
<template id="pages_tmpl">
    <div class="page_item">
        <a class="url link"><span class="title"></span><span class="status"></span></a>
        <div class="description"></div>
        <div class="created"></div>
        <hr>
    </div>
</template>

<!-- Details view of a single page; cloned and filled by page() in main.js. -->
<template id="page_tmpl">
    <a onclick="history.back()" class="link">Back</a>
    <div class="page">
        <h2 id="page_title"></h2>
        <h3 id="page_description"></h3>
        <!-- textContent yields the URL exactly as stored; innerHTML would return
             the HTML-serialized form (e.g. "&" as "&amp;") and open a wrong address.
             noopener keeps the opened page from reaching back into this window. -->
        <h5 id="page_url" class="link" onclick="window.open(this.textContent, '_blank', 'noopener')"></h5>
        <h4>Results</h4>
        <div id="results"></div>
    </div>
</template>

<!-- One processing result inside the details view; cloned by page() in main.js. -->
<template id="result_tmpl">
    <div class="result_item">
        <span class="format"></span>
        <span class="result_link link"></span>
    </div>
</template>

<h1 id="site_title"></h1>

<!-- Container that main.js clears and re-renders for every view. -->
<div id="data">
    None
</div>

</body>
</html>

2
ui/basic/lib.js Normal file

File diff suppressed because one or more lines are too long

90
ui/basic/main.js Normal file
View File

@@ -0,0 +1,90 @@
/**
 * Loads the list of archived pages from the API and renders it into #data.
 *
 * Every entry is a clone of the #pages_tmpl template; clicking an entry opens
 * its details through goToPage(). Failed requests are reported via gotError().
 */
function index() {
    $.ajax({
        url: "/api/v1/pages",
        success: function (data, status, xhr) {
            if (status !== "success") {
                gotError(status);
                return;
            }

            const elem = document.getElementById("data");
            elem.innerHTML = "";

            // Tolerate an empty body (null/undefined) instead of throwing.
            (data || []).forEach(function (v) {
                const page_elem = pages_tmpl.content.cloneNode(true);
                const item = $(page_elem);

                // Bind a real handler instead of assembling JavaScript source
                // inside an onclick attribute.
                item.find(".url").on("click", function () {
                    goToPage(v.id);
                });
                item.find(".status").addClass(v.status);
                item.find(".status").attr("title", v.status);
                // Plain data is inserted as text, never as markup.
                item.find(".created").text(v.created);
                // meta.title and meta.description arrive HTML-escaped from the
                // API, so .html() is what turns the entities back into text.
                item.find(".title").html(v.meta.title);
                item.find(".description").html(v.meta.description);

                elem.append(page_elem);
            });
        },
        // Without this callback a failed request was silently dropped.
        error: function (xhr, status, err) {
            gotError(err || status);
        },
    });
}
/**
 * Opens the details view of a page and records the navigation in the browser
 * history so that Back returns to the previous view.
 * @param {string} id - identifier of the page; also used as the new URL.
 */
function goToPage(id) {
    const state = {page: id};

    history.pushState(state, null, id);
    page(id);
}
/**
 * Loads a single archived page from the API and renders its details,
 * including one row per processing result, into #data.
 *
 * Failed requests are reported via gotError().
 *
 * @param {string} id - identifier of the page, appended to /api/v1/pages/.
 */
function page(id) {
    $.ajax({
        url: "/api/v1/pages/" + id,
        success: function (data, status, xhr) {
            if (status !== "success") {
                gotError(status);
                return;
            }

            const elem = document.getElementById("data");
            elem.innerHTML = "";

            const page_elem = page_tmpl.content.cloneNode(true);
            const view = $(page_elem);

            // meta.title and meta.description arrive HTML-escaped from the API,
            // so .html() is what turns the entities back into text.
            view.find("#page_title").html(data.meta.title);
            view.find("#page_description").html(data.meta.description);
            // The URL is user-supplied and not escaped by the API: insert it as
            // text so it can never be parsed as markup.
            view.find("#page_url").text(data.url);

            (data.results || []).forEach(function (result) {
                const result_elem = result_tmpl.content.cloneNode(true);
                const row = $(result_elem);

                row.find(".format").text(result.format);

                if (result.error !== "" && result.error !== undefined) {
                    row.find(".format").addClass("error");
                    row.find(".result_link").html("⚠");
                    row.find(".result_link").attr("title", result.error);
                } else {
                    // All files of a result share the single .result_link
                    // element, so the last file wins (same as before).
                    (result.files || []).forEach(function (file) {
                        const href = "/api/v1/pages/" + data.id + "/file/" + file.id;

                        // A bound handler replaces the JavaScript source that
                        // used to be concatenated into an onclick attribute.
                        row.find(".result_link").off("click").on("click", function () {
                            window.open(href, "_blank");
                        });
                        row.find(".result_link").text(file.name);
                    });
                }

                view.find("#results").append(result_elem);
            });

            elem.append(page_elem);
        },
        // Without this callback a failed request was silently dropped.
        error: function (xhr, status, err) {
            gotError(err || status);
        },
    });
}
/**
 * Reports a failed request by writing the given value to the console.
 * @param {*} err - whatever the failing callback received.
 */
function gotError(err) {
    const reported = err;

    console.log(reported);
}
// Entry point: once the DOM is parsed, put the host name into the heading and
// the tab title, then render either the page list (path ends with "/") or the
// details of the page whose id is the path without its leading slash.
document.addEventListener("DOMContentLoaded", function () {
    const heading = "WebArchive " + window.location.hostname;
    $("#site_title").html(heading);
    document.title = heading;

    const path = window.location.pathname;
    if (!path.endsWith("/")) {
        page(path.slice(1));
        return;
    }

    index();
});
// Back/forward navigation: a history entry without state is the page list,
// an entry pushed by goToPage() carries the id of the page to show.
window.addEventListener('popstate', function (event) {
    const state = event.state;

    if (state !== null) {
        page(state.page);
        return;
    }

    index();
});

61
ui/basic/style.css Normal file

File diff suppressed because one or more lines are too long

8
ui/embed.go Normal file
View File

@@ -0,0 +1,8 @@
package ui
import (
"embed"
)
// StaticFiles holds the embedded web UI assets: every .html, .css and .js file
// located exactly one directory below this package. The rest handler serves a
// single one of those directories, selected by the configured theme name.
//
//go:embed */*.html */*.css */*.js
var StaticFiles embed.FS