74 lines
1.6 KiB
Go
74 lines
1.6 KiB
Go
package main
|
|
|
|
import (
|
|
"fmt"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
// Article is one story scraped from a listing page by parseArticles.
type Article struct {
	// Title and Link are the text and href of the story's headline anchor.
	Title, Link string
	// Comments is the number parsed from the comments link text; it is 0
	// when no number could be extracted.
	Comments int
	// CommentsLink is the href of the comments link, or "" when the link has
	// no href or is absent.
	CommentsLink string
}
|
|
|
|
func parseArticles(htmlContent string) ([]Article, error) {
|
|
var articles []Article
|
|
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
doc.Find("tr.athing").Each(func(i int, s *goquery.Selection) {
|
|
title := s.Find("td.title > span.titleline > a").Text()
|
|
link, _ := s.Find("td.title > span.titleline > a").Attr("href")
|
|
commentText := s.Next().Find("a[href^='item']").Last().Text()
|
|
commentsCount, err := extractNumberFromString(commentText)
|
|
commentsLink := s.Next().Find("a[href^='item']").Last().AttrOr("href", "")
|
|
if err != nil {
|
|
commentsCount = 0
|
|
}
|
|
|
|
article := Article{
|
|
Title: title,
|
|
Link: link,
|
|
Comments: commentsCount,
|
|
CommentsLink: commentsLink,
|
|
}
|
|
|
|
articles = append(articles, article)
|
|
})
|
|
|
|
return articles, nil
|
|
}
|
|
|
|
func extractCommentsCount(s *goquery.Selection) (int, error) {
|
|
// find the second a[href^='item'] element
|
|
|
|
commentText := s.Next().Find("a[href^='item']").Last().Text()
|
|
|
|
return extractNumberFromString(commentText)
|
|
}
|
|
|
|
// firstDigitsRe matches a run of decimal digits. It is compiled once at
// package scope so extractNumberFromString does not recompile it on each call.
var firstDigitsRe = regexp.MustCompile(`\d+`)

// extractNumberFromString returns the first run of decimal digits in input as
// an int. Surrounding text and whitespace are ignored, and a leading sign is
// not interpreted.
//
// It returns 0 and an error when input contains no digits, or when the digit
// run does not fit in an int.
func extractNumberFromString(input string) (int, error) {
	digits := firstDigitsRe.FindString(input)
	if digits == "" {
		return 0, fmt.Errorf("no numbers found in input")
	}

	number, err := strconv.Atoi(digits)
	if err != nil {
		// Only reachable on overflow: digits is guaranteed to be all digits.
		return 0, fmt.Errorf("parsing %q as int: %w", digits, err)
	}
	return number, nil
}
|