Fix page meta retrieval

This commit is contained in:
2023-11-16 22:22:48 +03:00
parent 3147a0b683
commit e27fdabf78
2 changed files with 48 additions and 1 deletions

View File

@@ -129,8 +129,29 @@ func (p *Processors) GetMeta(ctx context.Context, url string) (entity.Meta, erro
return entity.Meta{}, fmt.Errorf("parse response body: %w", err)
}
var fc *html.Node
for fc = htmlNode.FirstChild; fc != nil && fc.Data != "html"; fc = fc.NextSibling {
}
if fc == nil {
return entity.Meta{}, fmt.Errorf("failed to find html tag")
}
fc = fc.NextSibling
if fc == nil {
return entity.Meta{}, fmt.Errorf("failed to find html tag")
}
for fc = fc.FirstChild; fc != nil && fc.Data != "head"; fc = fc.NextSibling {
fmt.Println(fc.Data)
}
if fc == nil {
return entity.Meta{}, fmt.Errorf("failed to find html tag")
}
meta := entity.Meta{}
getMetaData(htmlNode, &meta)
getMetaData(fc, &meta)
meta.Encoding = encodingFromHeader(response.Header)
return meta, nil

View File

@@ -0,0 +1,26 @@
package processors
import (
"context"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/derfenix/webarchive/config"
)
// TestProcessors_GetMeta checks that GetMeta returns the expected page title
// for a real-world article URL.
//
// The test fetches a live page, so it needs network access and relies on the
// remote site keeping its current title. It is skipped under `go test -short`
// so that offline and quick runs stay deterministic.
func TestProcessors_GetMeta(t *testing.T) {
	t.Parallel()

	if testing.Short() {
		t.Skip("skipping: test fetches a live page over the network")
	}

	const (
		pageURL   = "https://habr.com/ru/companies/wirenboard/articles/722718/"
		wantTitle = "Сколько стоит умный дом? Рассказываю, как строил свой и что получилось за 1000 руб./м² / Хабр"
	)

	ctx := context.Background()

	cfg, err := config.NewConfig(ctx)
	require.NoError(t, err)

	procs, err := NewProcessors(cfg)
	require.NoError(t, err)

	meta, err := procs.GetMeta(ctx, pageURL)
	require.NoError(t, err)

	assert.Equal(t, wantTitle, meta.Title)
}