Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generic attributes #120

Merged
merged 2 commits into from
Feb 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 68 additions & 12 deletions ad.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
package main

import (
"bufio"
"log/slog"
"strings"
"time"
Expand All @@ -31,10 +32,12 @@ type Ad struct {
Title string `goquery:"h1"`
Slug string
ID string
Details []string `goquery:".addetailslist--detail--value,text"`
Condition string // post processed from details
Type string // post processed from details
Color string // post processed from details
Details string `goquery:".addetailslist--detail,text"`
Attributes map[string]string // processed afterwards
Condition string // post processed from details for backward compatibility
Type string // post processed from details for backward compatibility
Color string // post processed from details for backward compatibility
Material string // post processed from details for backward compatibility
Category string
CategoryTree []string `goquery:".breadcrump-link,text"`
Price string `goquery:"h2#viewad-price"`
Expand All @@ -53,19 +56,11 @@ func (ad *Ad) LogValue() slog.Value {
slog.Int("imagecount", len(ad.Images)),
slog.Int("bodysize", len(ad.Text)),
slog.String("categorytree", strings.Join(ad.CategoryTree, "+")),
slog.String("condition", ad.Condition),
slog.String("created", ad.Created),
slog.String("expire", ad.Expire),
)
}

// static set of conditions available, used for post processing details
var CONDITIONS = []string{"Neu", "Gut", "Sehr Gut", "In Ordnung"}
var COLORS = []string{"Beige", "Blau", "Braun", "Bunt", "Burgunderrot",
"Creme", "Gelb", "Gold", "Grau", "Grün", "Holz", "Khaki", "Lavelndel",
"Lila", "Orange", "Pink", "Print", "Rot", "Schwarz", "Silber",
"Transparent", "Türkis", "Weiß", "Sonstige"}

// check for completeness. I declared these fields to be mandatory
// (though I really don't know if they really are). I consider images
// and meta optional. So, if either of the checked fields here is
Expand All @@ -90,3 +85,64 @@ func (ad *Ad) CalculateExpire() {
}
}
}

// DecodeAttributes parses the raw ad details text into ad.Attributes
// and fills in the legacy fields Condition, Color, Type and Material
// for backward compatibility. See
// https://github.com/TLINDEN/kleingebaeck/issues/117 for more
// details. In short: the HTML delivered by kleinanzeigen.de has no
// css attribute for the keys, so we cannot extract key=>value
// mappings of the ad details but have to parse them manually.
//
// The ad.Details member contains this after goq run:
//
//	Art
//
//	Weitere Kinderzimmermöbel
//
//	Farbe
//	Holz
//
//	Zustand
//	In Ordnung
//
// Blank lines are skipped and surrounding whitespace is trimmed; the
// remaining lines are read as alternating key and value lines. A
// trailing key without a value line is dropped.
//
// Every legacy field is filled in independently from its German
// attribute name (Zustand, Farbe, Art, Material), so an ad carrying
// several attributes gets all matching fields set. Fields whose
// attribute is absent are left untouched.
func (ad *Ad) DecodeAttributes() {
	scanner := bufio.NewScanner(strings.NewReader(ad.Details))

	attrmap := map[string]string{}
	key := ""
	expectKey := true

	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}

		if expectKey {
			key = line
		} else {
			attrmap[key] = line
		}

		expectKey = !expectKey
	}

	// Parsing is best effort: keep whatever has been decoded so far,
	// but do not hide the fact that the input was cut short.
	if err := scanner.Err(); err != nil {
		slog.Warn("could not parse ad details completely", "error", err)
	}

	ad.Attributes = attrmap

	// Not a switch: a switch would stop at the first matching case
	// and leave the remaining fields empty.
	if val, ok := attrmap["Zustand"]; ok {
		ad.Condition = val
	}

	if val, ok := attrmap["Farbe"]; ok {
		ad.Color = val
	}

	// The type of an ad is delivered under the german key "Art".
	if val, ok := attrmap["Art"]; ok {
		ad.Type = val
	}

	if val, ok := attrmap["Material"]; ok {
		ad.Material = val
	}
}
10 changes: 9 additions & 1 deletion config.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,19 @@ import (
)

const (
VERSION string = "0.3.15"
VERSION string = "0.3.16"
Baseuri string = "https://www.kleinanzeigen.de"
Listuri string = "/s-bestandsliste.html"
Defaultdir string = "."

/*
Also possible: loop through .Attributes:

DefaultTemplate string = "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.ID}}\n" +
"Category: {{.Category}}\n{{ range $key,$val := .Attributes }}{{ $key }}: {{ $val }}\n{{ end }}" +
"Created: {{.Created}}\nExpire: {{.Expire}}\n\n{{.Text}}\n"

*/
DefaultTemplate string = "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.ID}}\n" +
"Category: {{.Category}}\nCondition: {{.Condition}}\nType: {{.Type}}\nColor: {{.Color}}\n" +
"Created: {{.Created}}\nExpire: {{.Expire}}\n\n{{.Text}}\n"
Expand Down
40 changes: 27 additions & 13 deletions example.conf
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,31 @@ loglevel = "verbose"
# create it. must be a quoted string.
outdir = "test"

# template for stored adlistings. To enable it, remove the comment
# chars up until the last #"""
#template="""
#Title: {{.Title}}
#Price: {{.Price}}
#Id: {{.Id}}
#Category: {{.Category}}
#Condition: {{.Condition}}
#Type: {{.Type}}
#Created: {{.Created}}

#{{.Text}}
# """
# template for stored adlistings.
template="""
Title: {{.Title}}
Price: {{.Price}}
Id: {{.ID}}
Category: {{.Category}}
Condition: {{.Condition}}
Type: {{.Type}}
Created: {{.Created}}

{{.Text}}
"""

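# Ads may contain more attributes than just the Condition. To print
# all attributes, loop over all of them. Note that TOML allows a key
# to be defined only once, so keep just one template definition
# active and comment out the other one: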

template="""
Title: {{.Title}}
Price: {{.Price}}
Id: {{.ID}}
Category: {{.Category}}
{{ range $key,$val := .Attributes }}{{ $key }}: {{ $val }}
{{ end }}
Type: {{.Type}}
Created: {{.Created}}

{{.Text}}
"""
6 changes: 2 additions & 4 deletions kleingebaeck.1
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@
.\" ========================================================================
.\"
.IX Title "KLEINGEBAECK 1"
.TH KLEINGEBAECK 1 "2025-02-06" "1" "User Commands"
.TH KLEINGEBAECK 1 "2025-02-10" "1" "User Commands"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
Expand Down Expand Up @@ -174,7 +174,7 @@ well. We use \s-1TOML\s0 as our configuration language. See
.PP
Format is pretty simple:
.PP
.Vb 10
.Vb 11
\& user = 1010101
\& loglevel = verbose
\& outdir = "test"
Expand All @@ -185,8 +185,6 @@ Format is pretty simple:
\& Id: {{.ID}}
\& Category: {{.Category}}
\& Condition: {{.Condition}}
\& Type: {{.Type}}
\& Color: {{.Color}}
\& Created: {{.Created}}
\&
\& {{.Text}}
Expand Down
2 changes: 0 additions & 2 deletions kleingebaeck.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ CONFIGURATION
Id: {{.ID}}
Category: {{.Category}}
Condition: {{.Condition}}
Type: {{.Type}}
Color: {{.Color}}
Created: {{.Created}}

{{.Text}}
Expand Down
2 changes: 0 additions & 2 deletions kleingebaeck.pod
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ Format is pretty simple:
Id: {{.ID}}
Category: {{.Category}}
Condition: {{.Condition}}
Type: {{.Type}}
Color: {{.Color}}
Created: {{.Created}}

{{.Text}}
Expand Down
12 changes: 12 additions & 0 deletions main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,8 @@ var adsrc = []AdConfig{
Text: "Thing to sale",
Slug: "second-ad",
Condition: "Gut",
Color: "Lila",
Type: "Schoki",
Created: "Yesterday",
Images: []string{"t/1.jpg", "t/2.jpg"},
},
Expand All @@ -294,6 +296,8 @@ var adsrc = []AdConfig{
Text: "Thing to sale",
Slug: "third-ad",
Condition: "In Ordnung",
Color: "Blau",
Type: "Auto",
Created: "Yesterday",
Images: []string{"t/1.jpg", "t/2.jpg"},
},
Expand All @@ -305,6 +309,8 @@ var adsrc = []AdConfig{
Text: "Thing to sale",
Slug: "fourth-ad",
Condition: "Neu",
Color: "Rot",
Type: "Spielzeut",
Created: "Yesterday",
Images: []string{"t/1.jpg", "t/2.jpg"},
},
Expand All @@ -316,6 +322,8 @@ var adsrc = []AdConfig{
Text: "Thing to sale",
Slug: "fifth-ad",
Condition: "Sehr Gut",
Color: "Braun",
Type: "Parteibuch",
Created: "Yesterday",
Images: []string{"t/1.jpg", "t/2.jpg"},
},
Expand All @@ -327,6 +335,8 @@ var adsrc = []AdConfig{
Text: "Thing to sale",
Slug: "sixth-ad",
Condition: "Sehr Gut",
Color: "Silber",
Type: "Ring",
Created: "Yesterday",
Images: []string{"t/1.jpg", "t/2.jpg"},
},
Expand All @@ -338,6 +348,8 @@ var adsrc = []AdConfig{
Text: "Thing to sale",
Slug: "seventh-ad",
Condition: "Sehr Gut",
Color: "Gelpb",
Type: "Schmuck",
Created: "Yesterday",
Images: []string{"t/1.png", "t/1.gif", "t/1.webp", "t/1.jpg"},
},
Expand Down
13 changes: 1 addition & 12 deletions scrape.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import (
"fmt"
"log/slog"
"path/filepath"
"slices"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -125,17 +124,7 @@ func ScrapeAd(fetch *Fetcher, uri string) error {
return fmt.Errorf("could not extract ad data from page, got empty struct")
}

for _, detail := range advertisement.Details {
switch {
case slices.Contains(CONDITIONS, detail):
advertisement.Condition = detail
case slices.Contains(COLORS, detail):
advertisement.Color = detail
default:
advertisement.Type = detail
}
}

advertisement.DecodeAttributes()
advertisement.CalculateExpire()

// prepare ad dir name
Expand Down
Loading