From e27fdabf78b8a2d03c51c76971fe3e6ce1937d7b Mon Sep 17 00:00:00 2001
From: derfenix
Date: Thu, 16 Nov 2023 22:22:48 +0300
Subject: [PATCH] Fix page meta retrieve

Locate the <head> element of the parsed document and pass it to
getMetaData instead of the document root.

The root <html> element is matched by node type as well as by name,
because the doctype node also has Data == "html". Matching on the type
removes the need to step over the doctype with NextSibling, so pages
without a doctype are handled too.

---
Note: the post-image blob ids on the "index" lines below predate the
review changes made to this patch.

 adapters/processors/processors.go      | 26 +++++++++++++++++++++++++-
 adapters/processors/processors_test.go | 28 ++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 adapters/processors/processors_test.go

diff --git a/adapters/processors/processors.go b/adapters/processors/processors.go
index bab3c90..f17aff8 100644
--- a/adapters/processors/processors.go
+++ b/adapters/processors/processors.go
@@ -129,8 +129,32 @@ func (p *Processors) GetMeta(ctx context.Context, url string) (entity.Meta, erro
 		return entity.Meta{}, fmt.Errorf("parse response body: %w", err)
 	}
 
+	// Find the root <html> element. The node type is checked as well as the
+	// name because a <!DOCTYPE html> node also has Data == "html".
+	var fc *html.Node
+	for fc = htmlNode.FirstChild; fc != nil; fc = fc.NextSibling {
+		if fc.Type == html.ElementNode && fc.Data == "html" {
+			break
+		}
+	}
+
+	if fc == nil {
+		return entity.Meta{}, fmt.Errorf("failed to find html tag")
+	}
+
+	// Narrow the search to <head> so only that subtree is handed to getMetaData.
+	for fc = fc.FirstChild; fc != nil; fc = fc.NextSibling {
+		if fc.Type == html.ElementNode && fc.Data == "head" {
+			break
+		}
+	}
+
+	if fc == nil {
+		return entity.Meta{}, fmt.Errorf("failed to find head tag")
+	}
+
 	meta := entity.Meta{}
-	getMetaData(htmlNode, &meta)
+	getMetaData(fc, &meta)
 	meta.Encoding = encodingFromHeader(response.Header)
 
 	return meta, nil
diff --git a/adapters/processors/processors_test.go b/adapters/processors/processors_test.go
new file mode 100644
index 0000000..d226477
--- /dev/null
+++ b/adapters/processors/processors_test.go
@@ -0,0 +1,28 @@
+package processors
+
+import (
+	"context"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/derfenix/webarchive/config"
+)
+
+// TestProcessors_GetMeta calls GetMeta with a live HTTPS URL and checks the
+// returned title, so it needs network access and depends on that page's title.
+func TestProcessors_GetMeta(t *testing.T) {
+	t.Parallel()
+
+	ctx := context.Background()
+	cfg, err := config.NewConfig(ctx)
+	require.NoError(t, err)
+
+	procs, err := NewProcessors(cfg)
+	require.NoError(t, err)
+
+	meta, err := procs.GetMeta(ctx, "https://habr.com/ru/companies/wirenboard/articles/722718/")
+	require.NoError(t, err)
+	assert.Equal(t, "Сколько стоит умный дом? Рассказываю, как строил свой и что получилось за 1000 руб./м² / Хабр", meta.Title)
+}