10 Commits

Author SHA1 Message Date
f47dbefb67 web ui: index and basic details page, api refactoring 2023-04-04 23:02:02 +03:00
2a8b94136f web ui: basic logic 2023-04-04 16:24:35 +03:00
790eece361 Add roadmap item 2023-04-04 08:47:15 +03:00
f517a0e3a6 Update LICENSE.txt 2023-04-03 21:50:59 +03:00
dbb6d6f968 Improve docker-compose.yaml 2023-04-03 20:36:50 +03:00
a4f9022f40 Use prebuilt image in docker-compose.yaml 2023-04-03 19:12:43 +03:00
0a6b247765 Fix github actions 2023-04-03 18:40:34 +03:00
b7533d407f Hide pdf processor test with tag 2023-04-03 16:56:11 +03:00
7d4056e312 Update github actions 2023-04-03 16:54:30 +03:00
695021dae6 Add github actions 2023-04-03 16:50:32 +03:00
25 changed files with 1014 additions and 46 deletions

47
.github/workflows/release.yaml vendored Normal file
View File

@@ -0,0 +1,47 @@
# Release workflow: builds the Docker image from ./Dockerfile and pushes it
# to ghcr.io whenever a tag matching 'v*' is pushed.
name: release
on:
push:
tags:
- 'v*'
jobs:
release:
runs-on: ubuntu-latest
steps:
- name: Setup Go
uses: actions/setup-go@v3
with:
go-version: 1.20.x
- name: Checkout code
uses: actions/checkout@v3
- name: Setup Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v2
# Logs in with the workflow's own token (github.token) as the triggering actor.
- name: Login to GitHub Container Registry
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ github.token }}
# Only the `labels` output of this step is used below; the image tags are
# listed explicitly in the "Build and push" step.
- name: Docker meta
id: meta
uses: docker/metadata-action@v4
with:
images: ghcr.io/derfenix/webarchive
# Builds for linux/amd64 and linux/arm64 and pushes two tags: `latest` and
# the name of the git ref that triggered the run.
- name: Build and push
uses: docker/build-push-action@v4
with:
push: true
file: ./Dockerfile
platforms: linux/amd64,linux/arm64
tags: |
ghcr.io/derfenix/webarchive:latest
ghcr.io/derfenix/webarchive:${{github.ref_name}}
labels: ${{ steps.meta.outputs.labels }}

58
.github/workflows/test.yaml vendored Normal file
View File

@@ -0,0 +1,58 @@
# Test workflow: runs `go test ./...` and golangci-lint on every pull request
# and on pushes to the master branch.
name: test
on:
pull_request:
push:
branches:
- master
jobs:
test:
runs-on: ubuntu-latest
steps:
- name: Setup Go
uses: actions/setup-go@v3
with:
go-version: 1.20.x
- name: Checkout code
uses: actions/checkout@v3
# Caches the Go build cache and the module cache. The key is derived from the
# runner OS and the hash of every go.sum file; restore-keys falls back to any
# cache with the same OS prefix.
- name: go mod package cache
uses: actions/cache@v3
with:
path: |
~/.cache/go-build
~/go/pkg/mod
key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }}
restore-keys: |
${{ runner.os }}-go-
- name: Tests
run: go test ./...
- name: golangci-lint
uses: golangci/golangci-lint-action@v3
with:
# Optional: version of golangci-lint to use in form of v1.2 or v1.2.3 or `latest` to use the latest version
version: latest
# Optional: working directory, useful for monorepos
# working-directory: somedir
# Optional: golangci-lint command line arguments.
# args: --issues-exit-code=0
# Optional: show only new issues if it's a pull request. The default value is `false`.
# only-new-issues: true
# Optional: if set to true then the all caching functionality will be complete disabled,
# takes precedence over all other caching options.
# skip-cache: true
# Optional: if set to true then the action don't cache or restore ~/go/pkg.
# skip-pkg-cache: true
# Optional: if set to true then the action don't cache or restore ~/.cache/go-build.
# skip-build-cache: true

14
.idea/webResources.xml generated Normal file
View File

@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="WebResourcesPaths">
<contentEntries>
<entry url="file://$PROJECT_DIR$">
<entryData>
<resourceRoots>
<path value="file://$PROJECT_DIR$/ui" />
</resourceRoots>
</entryData>
</entry>
</contentEntries>
</component>
</project>

View File

@@ -2,14 +2,10 @@ Copyright (c) 2023, derfenix <derfenix@gmail.com> All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1) Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3) All advertising materials mentioning features or use of this software must display the following acknowledgement:
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
"This product includes software developed by the University of California, Berkeley and its contributors."
4) Neither the name of the <ORGANIZATION> nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -24,13 +24,17 @@ variables:
* **LOGGING_DEBUG** — enable debug logs (default `false`)
* **API**
* **API_ADDRESS** — address the API server will listen on (default `0.0.0.0:5001`)
* **UI**
* **UI_ENABLED** — Enable builtin web UI (default `true`)
* **UI_PREFIX** — Prefix for the web UI (default `/`)
* **UI_THEME** — UI theme name (default `basic`). No other values available yet
* **PDF**
* **PDF_LANDSCAPE** — use landscape page orientation instead of portrait (default `false`)
* **PDF_GRAYSCALE** — use grayscale filter for the output pdf (default `false`)
* **PDF_MEDIA_PRINT** — use media type `print` for the request (default `true`)
* **PDF_ZOOM** — zoom page (default `1.0` i.e. no actual zoom)
* **PDF_VIEWPORT** — use specified viewport value (default `1920x1080`)
* **PDF_DPI** — use specified DPI value for the output pdf (default `300`)
* **PDF_VIEWPORT** — use specified viewport value (default `1280x720`)
* **PDF_DPI** — use specified DPI value for the output pdf (default `150`)
* **PDF_FILENAME** — use specified name for output pdf file (default `page.pdf`)
@@ -60,7 +64,7 @@ docker compose up -d webarchive
### 2. Add a page
```shell
curl -X POST --location "http://localhost:5001/pages" \
curl -X POST --location "http://localhost:5001/api/v1/pages" \
-H "Content-Type: application/json" \
-d "{
\"url\": \"https://github.com/wkhtmltopdf/wkhtmltopdf/issues/1937\",
@@ -75,13 +79,13 @@ or
```shell
curl -X POST --location \
"http://localhost:5001/pages?url=https%3A%2F%2Fgithub.com%2Fwkhtmltopdf%2Fwkhtmltopdf%2Fissues%2F1937&formats=pdf%2Cheaders&description=Foo+Bar"
"http://localhost:5001/api/v1/pages?url=https%3A%2F%2Fgithub.com%2Fwkhtmltopdf%2Fwkhtmltopdf%2Fissues%2F1937&formats=pdf%2Cheaders&description=Foo+Bar"
```
### 3. Get the page's info
```shell
curl -X GET --location "http://localhost:5001/pages/$page_id" | jq .
curl -X GET --location "http://localhost:5001/api/v1/pages/$page_id" | jq .
```
where `$page_id` — value of the `id` field from previous command response.
If `status` field in response is `success` (or `with_errors`) - the `results` field
@@ -90,7 +94,7 @@ will contain all processed formats with ids of the stored files.
### 4. Open file in browser
```shell
xdg-open "http://localhost:5001/pages/$page_id/file/$file_id"
xdg-open "http://localhost:5001/api/v1/pages/$page_id/file/$file_id"
```
Where `$page_id` — value of the `id` field from previous command response, and
`$file_id` — the id of the file of interest.
@@ -98,7 +102,7 @@ Where `$page_id` — value of the `id` field from previous command response, an
### 5. List all stored pages
```shell
curl -X GET --location "http://localhost:5001/pages" | jq .
curl -X GET --location "http://localhost:5001/api/v1/pages" | jq .
```
## Roadmap
@@ -111,3 +115,4 @@ curl -X GET --location "http://localhost:5001/pages" | jq .
- [ ] Optional authentication
- [ ] Multi-user access
- [ ] Support SQL database with or without separate files storage
- [ ] Tags/Categories

View File

@@ -47,6 +47,7 @@ func (p *PDF) Process(_ context.Context, url string) ([]entity.File, error) {
page.FooterFontSize.Set(10)
page.Zoom.Set(p.cfg.Zoom)
page.ViewportSize.Set(p.cfg.Viewport)
page.NoBackground.Set(true)
gen.AddPage(page)

View File

@@ -1,3 +1,5 @@
//go:build integration
package processors
import (

View File

@@ -8,6 +8,8 @@ import (
"net/http/cookiejar"
"time"
"golang.org/x/net/html"
"github.com/derfenix/webarchive/config"
"github.com/derfenix/webarchive/entity"
)
@@ -52,6 +54,7 @@ func NewProcessors(cfg config.Config) (*Processors, error) {
}
procs := Processors{
client: httpClient,
processors: map[entity.Format]processor{
entity.FormatHeaders: NewHeaders(httpClient),
entity.FormatPDF: NewPDF(cfg.PDF),
@@ -64,6 +67,7 @@ func NewProcessors(cfg config.Config) (*Processors, error) {
// Processors groups the per-format processors together with the HTTP client
// used for fetching page metadata.
type Processors struct {
// processors maps an output format to the processor that produces it.
processors map[entity.Format]processor
// client is the HTTP client GetMeta uses to download the page.
client *http.Client
}
func (p *Processors) Process(ctx context.Context, format entity.Format, url string) entity.Result {
@@ -93,3 +97,62 @@ func (p *Processors) OverrideProcessor(format entity.Format, proc processor) err
return nil
}
// GetMeta downloads the page at url and extracts its title and description.
//
// The request is bound to ctx and sent with the processors' HTTP client.
// An error is returned when the request cannot be created or sent, when the
// response status is not 200, when the response carries no body, or when the
// body cannot be parsed as HTML.
func (p *Processors) GetMeta(ctx context.Context, url string) (entity.Meta, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return entity.Meta{}, fmt.Errorf("new request: %w", err)
	}

	response, err := p.client.Do(req)
	if err != nil {
		return entity.Meta{}, fmt.Errorf("do request: %w", err)
	}

	// Registered before any early return so the body is also released for
	// non-200 responses; otherwise the underlying connection would leak.
	defer func() {
		if response.Body != nil {
			_ = response.Body.Close()
		}
	}()

	if response.StatusCode != http.StatusOK {
		return entity.Meta{}, fmt.Errorf("want status 200, got %d", response.StatusCode)
	}

	if response.Body == nil {
		return entity.Meta{}, fmt.Errorf("empty response body")
	}

	htmlNode, err := html.Parse(response.Body)
	if err != nil {
		return entity.Meta{}, fmt.Errorf("parse response body: %w", err)
	}

	meta := entity.Meta{}
	getMetaData(htmlNode, &meta)

	return meta, nil
}
// getMetaData walks the subtree below n depth-first and fills meta from it:
// Title is taken from the first child of a <title> element, Description from
// the "content" attribute of a <meta name="description"> element. When several
// matching elements exist, the one visited last wins.
//
// A nil n is a no-op.
func getMetaData(n *html.Node, meta *entity.Meta) {
	if n == nil {
		return
	}

	for c := n.FirstChild; c != nil; c = c.NextSibling {
		if c.Type == html.ElementNode {
			switch c.Data {
			case "title":
				// An empty <title></title> has no child node; skip it
				// instead of dereferencing a nil FirstChild.
				if c.FirstChild != nil {
					meta.Title = c.FirstChild.Data
				}

			case "meta":
				// For duplicated attributes the last occurrence wins.
				var name, content string
				for _, attr := range c.Attr {
					switch attr.Key {
					case "name":
						name = attr.Val
					case "content":
						content = attr.Val
					}
				}

				if name == "description" {
					meta.Description = content
				}
			}
		}

		getMetaData(c, meta)
	}
}

View File

@@ -64,18 +64,18 @@ func (p *Page) GetFile(_ context.Context, pageID, fileID uuid.UUID) (*entity.Fil
return file, nil
}
func (p *Page) Save(_ context.Context, site *entity.Page) error {
func (p *Page) Save(_ context.Context, page *entity.Page) error {
if p.db.IsClosed() {
return ErrDBClosed
}
marshaled, err := marshal(site)
marshaled, err := marshal(page)
if err != nil {
return fmt.Errorf("marshal data: %w", err)
}
if err := p.db.Update(func(txn *badger.Txn) error {
if err := txn.Set(p.key(site), marshaled); err != nil {
if err := txn.Set(p.key(page), marshaled); err != nil {
return fmt.Errorf("put data: %w", err)
}
@@ -151,6 +151,64 @@ func (p *Page) ListAll(ctx context.Context) ([]*entity.Page, error) {
Formats: page.Formats,
Version: page.Version,
Status: page.Status,
Meta: page.Meta,
})
}
return nil
})
if err != nil {
return nil, fmt.Errorf("view: %w", err)
}
sort.Slice(pages, func(i, j int) bool {
return pages[i].Created.After(pages[j].Created)
})
return pages, nil
}
func (p *Page) ListUnprocessed(ctx context.Context) ([]*entity.Page, error) {
pages := make([]*entity.Page, 0, 100)
err := p.db.View(func(txn *badger.Txn) error {
iterator := txn.NewIterator(badger.DefaultIteratorOptions)
defer iterator.Close()
for iterator.Seek(p.prefix); iterator.ValidForPrefix(p.prefix); iterator.Next() {
if err := ctx.Err(); err != nil {
return fmt.Errorf("context canceled: %w", err)
}
var page entity.Page
err := iterator.Item().Value(func(val []byte) error {
if err := unmarshal(val, &page); err != nil {
return fmt.Errorf("unmarshal: %w", err)
}
return nil
})
if err != nil {
return fmt.Errorf("get item: %w", err)
}
if page.Status != entity.StatusProcessing {
continue
}
pages = append(pages, &entity.Page{
ID: page.ID,
URL: page.URL,
Description: page.Description,
Created: page.Created,
Formats: page.Formats,
Version: page.Version,
Status: page.Status,
Meta: page.Meta,
})
}

View File

@@ -4,7 +4,7 @@ info:
description: API description in Markdown.
version: 1.0.0
servers:
- url: 'https://api.example.com'
- url: 'https://api.example.com/api/v1'
paths:
/pages:
get:
@@ -183,12 +183,25 @@ components:
$ref: '#/components/schemas/format'
status:
$ref: '#/components/schemas/status'
meta:
type: object
properties:
title:
type: string
description:
type: string
error:
type: string
required:
- title
- description
required:
- id
- url
- formats
- status
- created
- meta
result:
type: object
properties:

View File

@@ -534,14 +534,20 @@ func (s *Page) encodeFields(e *jx.Encoder) {
e.FieldStart("status")
s.Status.Encode(e)
}
{
e.FieldStart("meta")
s.Meta.Encode(e)
}
}
var jsonFieldsNameOfPage = [5]string{
var jsonFieldsNameOfPage = [6]string{
0: "id",
1: "url",
2: "created",
3: "formats",
4: "status",
5: "meta",
}
// Decode decodes Page from json.
@@ -617,6 +623,16 @@ func (s *Page) Decode(d *jx.Decoder) error {
}(); err != nil {
return errors.Wrap(err, "decode field \"status\"")
}
case "meta":
requiredBitSet[0] |= 1 << 5
if err := func() error {
if err := s.Meta.Decode(d); err != nil {
return err
}
return nil
}(); err != nil {
return errors.Wrap(err, "decode field \"meta\"")
}
default:
return d.Skip()
}
@@ -627,7 +643,7 @@ func (s *Page) Decode(d *jx.Decoder) error {
// Validate required fields.
var failures []validate.FieldError
for i, mask := range [1]uint8{
0b00011111,
0b00111111,
} {
if result := (requiredBitSet[i] & mask) ^ mask; result != 0 {
// Mask only required fields and check equality to mask using XOR.
@@ -673,6 +689,138 @@ func (s *Page) UnmarshalJSON(data []byte) error {
return s.Decode(d)
}
// Encode implements json.Marshaler.
func (s *PageMeta) Encode(e *jx.Encoder) {
e.ObjStart()
s.encodeFields(e)
e.ObjEnd()
}
// encodeFields encodes fields.
func (s *PageMeta) encodeFields(e *jx.Encoder) {
{
e.FieldStart("title")
e.Str(s.Title)
}
{
e.FieldStart("description")
e.Str(s.Description)
}
{
if s.Error.Set {
e.FieldStart("error")
s.Error.Encode(e)
}
}
}
var jsonFieldsNameOfPageMeta = [3]string{
0: "title",
1: "description",
2: "error",
}
// Decode decodes PageMeta from json.
func (s *PageMeta) Decode(d *jx.Decoder) error {
if s == nil {
return errors.New("invalid: unable to decode PageMeta to nil")
}
var requiredBitSet [1]uint8
if err := d.ObjBytes(func(d *jx.Decoder, k []byte) error {
switch string(k) {
case "title":
requiredBitSet[0] |= 1 << 0
if err := func() error {
v, err := d.Str()
s.Title = string(v)
if err != nil {
return err
}
return nil
}(); err != nil {
return errors.Wrap(err, "decode field \"title\"")
}
case "description":
requiredBitSet[0] |= 1 << 1
if err := func() error {
v, err := d.Str()
s.Description = string(v)
if err != nil {
return err
}
return nil
}(); err != nil {
return errors.Wrap(err, "decode field \"description\"")
}
case "error":
if err := func() error {
s.Error.Reset()
if err := s.Error.Decode(d); err != nil {
return err
}
return nil
}(); err != nil {
return errors.Wrap(err, "decode field \"error\"")
}
default:
return d.Skip()
}
return nil
}); err != nil {
return errors.Wrap(err, "decode PageMeta")
}
// Validate required fields.
var failures []validate.FieldError
for i, mask := range [1]uint8{
0b00000011,
} {
if result := (requiredBitSet[i] & mask) ^ mask; result != 0 {
// Mask only required fields and check equality to mask using XOR.
//
// If XOR result is not zero, result is not equal to expected, so some fields are missed.
// Bits of fields which would be set are actually bits of missed fields.
missed := bits.OnesCount8(result)
for bitN := 0; bitN < missed; bitN++ {
bitIdx := bits.TrailingZeros8(result)
fieldIdx := i*8 + bitIdx
var name string
if fieldIdx < len(jsonFieldsNameOfPageMeta) {
name = jsonFieldsNameOfPageMeta[fieldIdx]
} else {
name = strconv.Itoa(fieldIdx)
}
failures = append(failures, validate.FieldError{
Name: name,
Error: validate.ErrFieldRequired,
})
// Reset bit.
result &^= 1 << bitIdx
}
}
}
if len(failures) > 0 {
return &validate.Error{Fields: failures}
}
return nil
}
// MarshalJSON implements stdjson.Marshaler.
func (s *PageMeta) MarshalJSON() ([]byte, error) {
e := jx.Encoder{}
s.Encode(&e)
return e.Bytes(), nil
}
// UnmarshalJSON implements stdjson.Unmarshaler.
func (s *PageMeta) UnmarshalJSON(data []byte) error {
d := jx.DecodeBytes(data)
return s.Decode(d)
}
// Encode implements json.Marshaler.
func (s *PageWithResults) Encode(e *jx.Encoder) {
e.ObjStart()
@@ -711,6 +859,11 @@ func (s *PageWithResults) encodeFields(e *jx.Encoder) {
e.FieldStart("status")
s.Status.Encode(e)
}
{
e.FieldStart("meta")
s.Meta.Encode(e)
}
{
e.FieldStart("results")
@@ -722,13 +875,14 @@ func (s *PageWithResults) encodeFields(e *jx.Encoder) {
}
}
var jsonFieldsNameOfPageWithResults = [6]string{
var jsonFieldsNameOfPageWithResults = [7]string{
0: "id",
1: "url",
2: "created",
3: "formats",
4: "status",
5: "results",
5: "meta",
6: "results",
}
// Decode decodes PageWithResults from json.
@@ -804,8 +958,18 @@ func (s *PageWithResults) Decode(d *jx.Decoder) error {
}(); err != nil {
return errors.Wrap(err, "decode field \"status\"")
}
case "results":
case "meta":
requiredBitSet[0] |= 1 << 5
if err := func() error {
if err := s.Meta.Decode(d); err != nil {
return err
}
return nil
}(); err != nil {
return errors.Wrap(err, "decode field \"meta\"")
}
case "results":
requiredBitSet[0] |= 1 << 6
if err := func() error {
s.Results = make([]Result, 0)
if err := d.Arr(func(d *jx.Decoder) error {
@@ -832,7 +996,7 @@ func (s *PageWithResults) Decode(d *jx.Decoder) error {
// Validate required fields.
var failures []validate.FieldError
for i, mask := range [1]uint8{
0b00111111,
0b01111111,
} {
if result := (requiredBitSet[i] & mask) ^ mask; result != 0 {
// Mask only required fields and check equality to mask using XOR.
@@ -878,6 +1042,138 @@ func (s *PageWithResults) UnmarshalJSON(data []byte) error {
return s.Decode(d)
}
// Encode implements json.Marshaler.
func (s *PageWithResultsMeta) Encode(e *jx.Encoder) {
e.ObjStart()
s.encodeFields(e)
e.ObjEnd()
}
// encodeFields encodes fields.
func (s *PageWithResultsMeta) encodeFields(e *jx.Encoder) {
{
e.FieldStart("title")
e.Str(s.Title)
}
{
e.FieldStart("description")
e.Str(s.Description)
}
{
if s.Error.Set {
e.FieldStart("error")
s.Error.Encode(e)
}
}
}
var jsonFieldsNameOfPageWithResultsMeta = [3]string{
0: "title",
1: "description",
2: "error",
}
// Decode decodes PageWithResultsMeta from json.
func (s *PageWithResultsMeta) Decode(d *jx.Decoder) error {
if s == nil {
return errors.New("invalid: unable to decode PageWithResultsMeta to nil")
}
var requiredBitSet [1]uint8
if err := d.ObjBytes(func(d *jx.Decoder, k []byte) error {
switch string(k) {
case "title":
requiredBitSet[0] |= 1 << 0
if err := func() error {
v, err := d.Str()
s.Title = string(v)
if err != nil {
return err
}
return nil
}(); err != nil {
return errors.Wrap(err, "decode field \"title\"")
}
case "description":
requiredBitSet[0] |= 1 << 1
if err := func() error {
v, err := d.Str()
s.Description = string(v)
if err != nil {
return err
}
return nil
}(); err != nil {
return errors.Wrap(err, "decode field \"description\"")
}
case "error":
if err := func() error {
s.Error.Reset()
if err := s.Error.Decode(d); err != nil {
return err
}
return nil
}(); err != nil {
return errors.Wrap(err, "decode field \"error\"")
}
default:
return d.Skip()
}
return nil
}); err != nil {
return errors.Wrap(err, "decode PageWithResultsMeta")
}
// Validate required fields.
var failures []validate.FieldError
for i, mask := range [1]uint8{
0b00000011,
} {
if result := (requiredBitSet[i] & mask) ^ mask; result != 0 {
// Mask only required fields and check equality to mask using XOR.
//
// If XOR result is not zero, result is not equal to expected, so some fields are missed.
// Bits of fields which would be set are actually bits of missed fields.
missed := bits.OnesCount8(result)
for bitN := 0; bitN < missed; bitN++ {
bitIdx := bits.TrailingZeros8(result)
fieldIdx := i*8 + bitIdx
var name string
if fieldIdx < len(jsonFieldsNameOfPageWithResultsMeta) {
name = jsonFieldsNameOfPageWithResultsMeta[fieldIdx]
} else {
name = strconv.Itoa(fieldIdx)
}
failures = append(failures, validate.FieldError{
Name: name,
Error: validate.ErrFieldRequired,
})
// Reset bit.
result &^= 1 << bitIdx
}
}
}
if len(failures) > 0 {
return &validate.Error{Fields: failures}
}
return nil
}
// MarshalJSON implements stdjson.Marshaler.
func (s *PageWithResultsMeta) MarshalJSON() ([]byte, error) {
e := jx.Encoder{}
s.Encode(&e)
return e.Bytes(), nil
}
// UnmarshalJSON implements stdjson.Unmarshaler.
func (s *PageWithResultsMeta) UnmarshalJSON(data []byte) error {
d := jx.DecodeBytes(data)
return s.Decode(d)
}
// Encode encodes Pages as json.
func (s Pages) Encode(e *jx.Encoder) {
unwrapped := []Page(s)

View File

@@ -324,6 +324,7 @@ type Page struct {
Created time.Time `json:"created"`
Formats []Format `json:"formats"`
Status Status `json:"status"`
Meta PageMeta `json:"meta"`
}
// GetID returns the value of ID.
@@ -351,6 +352,11 @@ func (s *Page) GetStatus() Status {
return s.Status
}
// GetMeta returns the value of Meta.
func (s *Page) GetMeta() PageMeta {
return s.Meta
}
// SetID sets the value of ID.
func (s *Page) SetID(val uuid.UUID) {
s.ID = val
@@ -376,17 +382,59 @@ func (s *Page) SetStatus(val Status) {
s.Status = val
}
// SetMeta sets the value of Meta.
func (s *Page) SetMeta(val PageMeta) {
s.Meta = val
}
func (*Page) addPageRes() {}
type PageMeta struct {
Title string `json:"title"`
Description string `json:"description"`
Error OptString `json:"error"`
}
// GetTitle returns the value of Title.
func (s *PageMeta) GetTitle() string {
return s.Title
}
// GetDescription returns the value of Description.
func (s *PageMeta) GetDescription() string {
return s.Description
}
// GetError returns the value of Error.
func (s *PageMeta) GetError() OptString {
return s.Error
}
// SetTitle sets the value of Title.
func (s *PageMeta) SetTitle(val string) {
s.Title = val
}
// SetDescription sets the value of Description.
func (s *PageMeta) SetDescription(val string) {
s.Description = val
}
// SetError sets the value of Error.
func (s *PageMeta) SetError(val OptString) {
s.Error = val
}
// Merged schema.
// Ref: #/components/schemas/pageWithResults
type PageWithResults struct {
ID uuid.UUID `json:"id"`
URL string `json:"url"`
Created time.Time `json:"created"`
Formats []Format `json:"formats"`
Status Status `json:"status"`
Results []Result `json:"results"`
ID uuid.UUID `json:"id"`
URL string `json:"url"`
Created time.Time `json:"created"`
Formats []Format `json:"formats"`
Status Status `json:"status"`
Meta PageWithResultsMeta `json:"meta"`
Results []Result `json:"results"`
}
// GetID returns the value of ID.
@@ -414,6 +462,11 @@ func (s *PageWithResults) GetStatus() Status {
return s.Status
}
// GetMeta returns the value of Meta.
func (s *PageWithResults) GetMeta() PageWithResultsMeta {
return s.Meta
}
// GetResults returns the value of Results.
func (s *PageWithResults) GetResults() []Result {
return s.Results
@@ -444,6 +497,11 @@ func (s *PageWithResults) SetStatus(val Status) {
s.Status = val
}
// SetMeta sets the value of Meta.
func (s *PageWithResults) SetMeta(val PageWithResultsMeta) {
s.Meta = val
}
// SetResults sets the value of Results.
func (s *PageWithResults) SetResults(val []Result) {
s.Results = val
@@ -451,6 +509,42 @@ func (s *PageWithResults) SetResults(val []Result) {
func (*PageWithResults) getPageRes() {}
type PageWithResultsMeta struct {
Title string `json:"title"`
Description string `json:"description"`
Error OptString `json:"error"`
}
// GetTitle returns the value of Title.
func (s *PageWithResultsMeta) GetTitle() string {
return s.Title
}
// GetDescription returns the value of Description.
func (s *PageWithResultsMeta) GetDescription() string {
return s.Description
}
// GetError returns the value of Error.
func (s *PageWithResultsMeta) GetError() OptString {
return s.Error
}
// SetTitle sets the value of Title.
func (s *PageWithResultsMeta) SetTitle(val string) {
s.Title = val
}
// SetDescription sets the value of Description.
func (s *PageWithResultsMeta) SetDescription(val string) {
s.Description = val
}
// SetError sets the value of Error.
func (s *PageWithResultsMeta) SetError(val OptString) {
s.Error = val
}
type Pages []Page
// Ref: #/components/schemas/result

View File

@@ -6,6 +6,7 @@ import (
"fmt"
"net"
"net/http"
"strings"
"sync"
"time"
@@ -48,7 +49,8 @@ func NewApplication(cfg config.Config) (Application, error) {
worker := entity.NewWorker(workerCh, pageRepo, processor, log.Named("worker"))
server, err := openapi.NewServer(
rest.NewService(pageRepo, workerCh),
rest.NewService(pageRepo, workerCh, processor),
openapi.WithPathPrefix("/api/v1"),
openapi.WithMiddleware(
func(r middleware.Request, next middleware.Next) (middleware.Response, error) {
start := time.Now()
@@ -73,9 +75,25 @@ func NewApplication(cfg config.Config) (Application, error) {
return Application{}, fmt.Errorf("new rest server: %w", err)
}
var httpHandler http.Handler = server
if cfg.UI.Enabled {
ui := rest.NewUI(cfg.UI)
httpHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if strings.HasPrefix(r.URL.Path, "/api/") {
server.ServeHTTP(w, r)
return
}
ui.ServeHTTP(w, r)
})
}
httpServer := http.Server{
Addr: cfg.API.Address,
Handler: server,
Handler: httpHandler,
ReadTimeout: time.Second * 15,
ReadHeaderTimeout: time.Second * 5,
IdleTimeout: time.Second * 30,

View File

@@ -28,6 +28,7 @@ type Config struct {
DB DB `env:",prefix=DB_"`
Logging Logging `env:",prefix=LOGGING_"`
API API `env:",prefix=API_"`
UI UI `env:",prefix=UI_"`
PDF PDF `env:",prefix=PDF_"`
}
@@ -36,8 +37,8 @@ type PDF struct {
Grayscale bool `env:"GRAYSCALE,default=false"`
MediaPrint bool `env:"MEDIA_PRINT,default=true"`
Zoom float64 `env:"ZOOM,default=1"`
Viewport string `env:"VIEWPORT,default=1920x1080"`
DPI uint `env:"DPI,default=300"`
Viewport string `env:"VIEWPORT,default=1280x720"`
DPI uint `env:"DPI,default=150"`
Filename string `env:"FILENAME,default=page.pdf"`
}
@@ -45,6 +46,12 @@ type API struct {
Address string `env:"ADDRESS,default=0.0.0.0:5001"`
}
// UI holds the settings of the builtin web UI, populated from environment
// variables via the `env` struct tags.
type UI struct {
// Enabled turns the builtin web UI on or off (default true).
Enabled bool `env:"ENABLED,default=true"`
// Prefix is the URL path prefix of the web UI (default "/").
Prefix string `env:"PREFIX,default=/"`
// Theme is the name of the UI theme (default "basic").
Theme string `env:"THEME,default=basic"`
}
type DB struct {
Path string `env:"PATH,default=./db"`
}

View File

@@ -2,14 +2,15 @@ version: "3"
services:
webarchive:
build:
dockerfile: ./Dockerfile
context: .
image: ghcr.io/derfenix/webarchive:latest
# build:
# dockerfile: ./Dockerfile
# context: .
environment:
LOGGING_DEBUG: true
API_ADDRESS: 0.0.0.0:5001
PDF_DPI: 300
DB_PATH: /db
LOGGING_DEBUG: "true"
API_ADDRESS: "0.0.0.0:5001"
PDF_DPI: "300"
DB_PATH: "/db"
volumes:
- ./db:/db
ports:

View File

@@ -11,6 +11,7 @@ import (
type Processor interface {
Process(ctx context.Context, format Format, url string) Result
GetMeta(ctx context.Context, url string) (Meta, error)
}
type Format uint8
@@ -37,6 +38,12 @@ const (
StatusWithErrors
)
// Meta is the descriptive information about a page, as returned by
// Processor.GetMeta.
type Meta struct {
// Title is the page title.
Title string
// Description is the page description.
Description string
// Error is a free-form error message; empty when there is nothing to report.
Error string
}
func NewPage(url string, description string, formats ...Format) *Page {
return &Page{
ID: uuid.New(),
@@ -57,6 +64,7 @@ type Page struct {
Results Results
Version uint16
Status Status
Meta Meta
}
func (p *Page) SetProcessing() {

View File

@@ -9,6 +9,7 @@ import (
type Pages interface {
Save(ctx context.Context, page *Page) error
ListUnprocessed(ctx context.Context) ([]*Page, error)
}
func NewWorker(ch chan *Page, pages Pages, processor Processor, log *zap.Logger) *Worker {
@@ -27,6 +28,20 @@ func (w *Worker) Start(ctx context.Context, wg *sync.WaitGroup) {
w.log.Info("starting")
wg.Add(1)
// Re-queue pages returned by ListUnprocessed so they are handled by the
// worker loop again.
go func() {
	defer wg.Done()

	unprocessed, err := w.pages.ListUnprocessed(ctx)
	if err != nil {
		w.log.Error("failed to get unprocessed pages", zap.Error(err))
		return
	}

	for i := range unprocessed {
		// Stop feeding the channel once ctx is cancelled: after cancellation
		// nobody may be receiving from w.ch any more, and a bare send would
		// block forever and keep wg from ever reaching zero.
		select {
		case <-ctx.Done():
			return
		case w.ch <- unprocessed[i]:
		}
	}
}()
for {
select {
case <-ctx.Done():

View File

@@ -2,6 +2,7 @@ package rest
import (
"fmt"
"html"
"github.com/derfenix/webarchive/api/openapi"
"github.com/derfenix/webarchive/entity"
@@ -22,6 +23,11 @@ func PageToRestWithResults(page *entity.Page) openapi.PageWithResults {
return res
}(),
Status: StatusToRest(page.Status),
Meta: openapi.PageWithResultsMeta{
Title: html.EscapeString(page.Meta.Title),
Description: html.EscapeString(page.Meta.Description),
Error: openapi.NewOptString(page.Meta.Error),
},
Results: func() []openapi.Result {
results := make([]openapi.Result, len(page.Results.Results()))
@@ -65,6 +71,11 @@ func PageToRest(page *entity.Page) openapi.Page {
ID: page.ID,
URL: page.URL,
Created: page.Created,
Meta: openapi.PageMeta{
Title: html.EscapeString(page.Meta.Title),
Description: html.EscapeString(page.Meta.Description),
Error: openapi.NewOptString(page.Meta.Error),
},
Formats: func() []openapi.Format {
res := make([]openapi.Format, len(page.Formats))

View File

@@ -20,14 +20,19 @@ type Pages interface {
GetFile(ctx context.Context, pageID, fileID uuid.UUID) (*entity.File, error)
}
func NewService(sites Pages, ch chan *entity.Page) *Service {
return &Service{pages: sites, ch: ch}
func NewService(pages Pages, ch chan *entity.Page, processor entity.Processor) *Service {
return &Service{
pages: pages,
ch: ch,
processor: processor,
}
}
type Service struct {
openapi.UnimplementedHandler
pages Pages
ch chan *entity.Page
pages Pages
ch chan *entity.Page
processor entity.Processor
}
func (s *Service) GetPage(ctx context.Context, params openapi.GetPageParams) (openapi.GetPageRes, error) {
@@ -78,6 +83,13 @@ func (s *Service) AddPage(ctx context.Context, req openapi.OptAddPageReq, params
page := entity.NewPage(url, description, domainFormats...)
page.Status = entity.StatusProcessing
meta, err := s.processor.GetMeta(ctx, page.URL)
if err != nil {
page.Meta.Error = err.Error()
} else {
page.Meta = meta
}
if err := s.pages.Save(ctx, page); err != nil {
return nil, fmt.Errorf("save page: %w", err)
}

41
ports/rest/ui.go Normal file
View File

@@ -0,0 +1,41 @@
package rest
import (
"io/fs"
"net/http"
"strings"
"github.com/derfenix/webarchive/config"
"github.com/derfenix/webarchive/ui"
)
// NewUI builds the web UI handler from the UI section of the configuration,
// taking over its path prefix and theme name.
func NewUI(cfg config.UI) *UI {
	handler := new(UI)
	handler.prefix = cfg.Prefix
	handler.theme = cfg.Theme

	return handler
}
// UI is an http.Handler that serves the embedded web UI files.
type UI struct {
// prefix is the URL path prefix that is stripped from incoming request paths.
prefix string
// theme is the name of the sub-directory of ui.StaticFiles that is served.
theme string
}
// ServeHTTP serves the embedded UI files of the configured theme.
//
// The configured prefix is removed from the request path first. Paths that
// then start with "/static" are served from the theme directory with the
// "/static" segment stripped; every other path is rewritten to "/" so that
// the file server answers with the theme's directory index, letting the
// client-side code handle page URLs itself.
//
// If the theme directory cannot be opened inside the embedded file system,
// the handler replies with 500 Internal Server Error.
func (u *UI) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	serveRoot, err := fs.Sub(ui.StaticFiles, u.theme)
	if err != nil {
		w.WriteHeader(http.StatusInternalServerError)

		return
	}

	path := r.URL.Path
	if strings.HasPrefix(path, u.prefix) {
		// Normalise to exactly one leading slash. Without the TrimLeft a prefix
		// lacking a trailing slash (e.g. "/ui") produced "//static/..." which
		// failed the "/static" check below and always served the index page.
		path = "/" + strings.TrimLeft(strings.TrimPrefix(path, u.prefix), "/")
	}

	if strings.HasPrefix(path, "/static") {
		path = strings.TrimPrefix(path, "/static")
	} else {
		path = "/"
	}

	r.URL.Path = path
	http.FileServer(http.FS(serveRoot)).ServeHTTP(w, r)
}

47
ui/basic/index.html Normal file
View File

@@ -0,0 +1,47 @@
<!doctype html>
<!--
  Single-page shell of the "basic" theme.
  main.js clones the <template> elements below and renders them into #data.
-->
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, user-scalable=no, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <title>WebArchive</title>
    <link rel="stylesheet" href="/static/style.css">
    <script src="/static/lib.js"></script>
    <script src="/static/main.js"></script>
</head>
<body>
<!-- One entry of the pages list. -->
<template id="pages_tmpl">
    <div class="page_item">
        <a class="url link"><span class="title"></span><span class="status"></span></a>
        <div class="description"></div>
        <div class="created"></div>
        <hr>
    </div>
</template>
<!-- Details view of a single page. -->
<template id="page_tmpl">
    <a onclick="history.back()" class="link">Back</a>
    <div class="page">
        <h2 id="page_title"></h2>
        <h3 id="page_description"></h3>
        <!-- textContent, not innerHTML: innerHTML returns the serialised markup, so "&" in a URL would come back as "&amp;". -->
        <h5 id="page_url" class="link" onclick="window.open(this.textContent, '_blank', 'noopener')"></h5>
        <h4>Results</h4>
        <div id="results"></div>
    </div>
</template>
<!-- One processing result (format plus link or error marker) inside the details view. -->
<template id="result_tmpl">
    <div class="result_item">
        <span class="format"></span>
        <span class="result_link link"></span>
    </div>
</template>
<h1 id="site_title"></h1>
<div id="data">
    None
</div>
</body>
</html>

2
ui/basic/lib.js Normal file

File diff suppressed because one or more lines are too long

90
ui/basic/main.js Normal file
View File

@@ -0,0 +1,90 @@
/**
 * Load the list of archived pages from the API and render it into #data.
 *
 * Each entry is cloned from the #pages_tmpl template. Title and description
 * are inserted with .html() because the REST layer is expected to deliver
 * them already HTML-escaped; all other values are inserted as plain text.
 * Request failures and non-"success" statuses are reported via gotError().
 */
function index() {
    $.ajax({
        url: "/api/v1/pages",
        success: function (data, status) {
            if (status !== "success") {
                gotError(status);
                return;
            }

            const elem = document.getElementById("data");
            elem.innerHTML = "";

            // A missing or null body must not break rendering of an empty list.
            const pages = Array.isArray(data) ? data : [];
            pages.forEach(function (v) {
                const page_elem = pages_tmpl.content.cloneNode(true);
                const $item = $(page_elem);
                const meta = v.meta || {};

                // Bind a real handler instead of building JS source in an attribute.
                $item.find(".url").on("click", function () {
                    goToPage(v.id);
                });
                $item.find(".status").addClass(v.status).attr("title", v.status);
                $item.find(".created").text(v.created);
                $item.find(".title").html(meta.title);
                $item.find(".description").html(meta.description);

                elem.append(page_elem);
            });
        },
        error: function (xhr, status, err) {
            gotError(err || status);
        },
    });
}
/**
 * Navigate to the details view of a page: record a history entry whose state
 * carries the page id (and whose URL is the id), then render that page.
 */
function goToPage(id) {
    const state = { page: id };
    history.pushState(state, null, id);

    page(id);
}
/**
 * Load a single page with its results from the API and render it into #data.
 *
 * The view is cloned from #page_tmpl, each result from #result_tmpl. Title
 * and description are inserted with .html() because the REST layer is
 * expected to deliver them already HTML-escaped; the URL, format, file names
 * and error text are inserted as plain text. A failed result shows a warning
 * sign with the error as tooltip; otherwise every file of the result gets its
 * own link. Request failures and non-"success" statuses go to gotError().
 *
 * @param {string} id - Page identifier, appended to the API URL as is.
 */
function page(id) {
    $.ajax({
        url: "/api/v1/pages/" + id,
        success: function (data, status) {
            if (status !== "success") {
                gotError(status);
                return;
            }

            const elem = document.getElementById("data");
            elem.innerHTML = "";

            const page_elem = page_tmpl.content.cloneNode(true);
            const $page = $(page_elem);
            const meta = data.meta || {};
            $page.find("#page_title").html(meta.title);
            $page.find("#page_description").html(meta.description);
            $page.find("#page_url").text(data.url);

            const $results = $page.find("#results");
            (data.results || []).forEach(function (result) {
                const result_elem = result_tmpl.content.cloneNode(true);
                const $result = $(result_elem);
                const $format = $result.find(".format");
                const $link = $result.find(".result_link");

                $format.text(result.format);

                if (result.error) {
                    $format.addClass("error");
                    $link.text("⚠").attr("title", result.error);
                } else {
                    // The first file reuses the template's link element; each
                    // further file gets a clone so that no link is overwritten.
                    let $last = $link;
                    (result.files || []).forEach(function (file, i) {
                        let $target = $link;
                        if (i > 0) {
                            $target = $link.clone();
                            $last.after(" ", $target);
                        }

                        const fileUrl = "/api/v1/pages/" + data.id + "/file/" + file.id;
                        $target.on("click", function () {
                            window.open(fileUrl, "_blank", "noopener");
                        });
                        $target.text(file.name);
                        $last = $target;
                    });
                }

                $results.append(result_elem);
            });

            elem.append(page_elem);
        },
        error: function (xhr, status, err) {
            gotError(err || status);
        },
    });
}
/**
 * Report a failed API request.
 *
 * Writes to the console at error level so failures are distinguishable from
 * ordinary log output.
 *
 * @param {*} err - Status text or error value describing the failure.
 */
function gotError(err) {
    console.error(err);
}
// Initial render: set the heading and document title, then show either the
// pages list (path ends with "/") or the page whose id is the path without
// its leading slash.
document.addEventListener("DOMContentLoaded", function () {
    const heading = "WebArchive " + window.location.hostname;
    $("#site_title").html(heading);
    document.title = heading;

    const path = window.location.pathname;
    if (!path.endsWith("/")) {
        page(path.slice(1));
        return;
    }
    index();
});
// Back/forward navigation: a history entry without state is the pages list,
// an entry with state carries the id of the page to show.
window.addEventListener("popstate", function (event) {
    const state = event.state;
    if (state !== null) {
        page(state.page);
        return;
    }
    index();
});

61
ui/basic/style.css Normal file

File diff suppressed because one or more lines are too long

8
ui/embed.go Normal file
View File

@@ -0,0 +1,8 @@
package ui
import (
"embed"
)
// StaticFiles holds the UI assets embedded into the binary: every .html, .css
// and .js file located exactly one directory level below this package.
//
//go:embed */*.html */*.css */*.js
var StaticFiles embed.FS