rsslair/html_parser.go

172 lines
3.8 KiB
Go

package main
import (
"strings"
"github.com/PuerkitoBio/goquery"
lua "github.com/yuin/gopher-lua"
)
// HtmlParser wraps a parsed goquery document.
//
// NOTE(review): newHtmlParser stores the *goquery.Document directly in the
// Lua userdata rather than an *HtmlParser, and the Lua-facing methods in this
// file type-assert on *goquery.Document. Within this file only
// checkHtmlParser refers to this wrapper type.
type HtmlParser struct {
// Doc is the underlying parsed HTML document.
Doc *goquery.Document
}
// luaHtmlParserTypeName is the name of the Lua type metatable for the HTML
// parser; registerHtmlParserType also uses it as the name of the Lua global
// through which scripts reach the type.
const luaHtmlParserTypeName = "html"
// luaHtmlParserMethods maps the method names visible from Lua to their Go
// implementations. registerHtmlParserType installs this table as the
// metatable's __index.
var luaHtmlParserMethods = map[string]lua.LGFunction{
// select(selector): inner HTML of every matching element, newline-joined.
"select": select_html_node,
// remove(selector): removes every matching element from the document.
"remove": remove_html_node,
// get(): the document serialized as HTML.
"get": get_document,
// get_attribute(selector, attribute): table of attribute values.
"get_attribute": get_html_node_attribute,
// rewrite_nodes(selector, attribute, values): sets the attribute on matches.
"rewrite_nodes": rewrite_html_nodes,
}
// registerHtmlParserType makes the HTML parser available to Lua scripts
// running in L. It creates the type metatable named by
// luaHtmlParserTypeName, publishes it as a global of the same name, attaches
// the `new` constructor, and installs luaHtmlParserMethods as the `__index`
// table.
func registerHtmlParserType(L *lua.LState) {
	logger.Debug("Registering html type")

	metatable := L.NewTypeMetatable(luaHtmlParserTypeName)
	L.SetGlobal(luaHtmlParserTypeName, metatable)

	// Constructor: html.new(source).
	constructor := L.NewFunction(newHtmlParser)
	L.SetField(metatable, "new", constructor)

	// Method table consulted for lookups on parser instances.
	methods := L.SetFuncs(L.NewTable(), luaHtmlParserMethods)
	L.SetField(metatable, "__index", methods)
}
// checkHtmlParser returns the HTML parser passed as the first Lua argument.
//
// Two userdata payloads are accepted: an *HtmlParser, which is returned
// as-is, and a bare *goquery.Document, which is what newHtmlParser stores and
// is wrapped in a fresh HtmlParser here. Previously only *HtmlParser was
// accepted, so objects created by newHtmlParser were always rejected.
//
// For any other payload a Lua argument error is reported through L.ArgError;
// the trailing nil return is only there to satisfy the compiler.
func checkHtmlParser(L *lua.LState) *HtmlParser {
	ud := L.CheckUserData(1)
	switch v := ud.Value.(type) {
	case *HtmlParser:
		return v
	case *goquery.Document:
		return &HtmlParser{Doc: v}
	}
	L.ArgError(1, "html_parser expected")
	return nil
}
// newHtmlParser is the Lua constructor registered as `new` on the html type.
// It parses the HTML source given as the first argument and returns a
// userdata carrying the resulting document.
//
// Lua arguments: (source string).
// Lua returns: one userdata on success; nothing if parsing fails (the error
// is logged).
func newHtmlParser(L *lua.LState) int {
	reader := strings.NewReader(L.CheckString(1))

	document, parseErr := goquery.NewDocumentFromReader(reader)
	if parseErr != nil {
		logger.Error(parseErr)
		return 0
	}

	// The *goquery.Document itself is the userdata payload; the methods in
	// luaHtmlParserMethods type-assert on exactly that type.
	userData := L.NewUserData()
	userData.Value = document
	L.SetMetatable(userData, L.GetTypeMetatable(luaHtmlParserTypeName))

	L.Push(userData)
	return 1
}
// select_html_node implements the Lua method `select`.
//
// Lua arguments: (self userdata, selector string).
// Lua returns: one string holding the inner HTML of every element matching
// selector, joined with "\n" (empty when nothing matches). Elements whose
// HTML cannot be rendered are logged and left out. Returns nothing if self
// does not carry a *goquery.Document.
func select_html_node(L *lua.LState) int {
	userData := L.CheckUserData(1)
	query := L.CheckString(2)

	document, isDocument := userData.Value.(*goquery.Document)
	if !isDocument {
		logger.Error("Expected html_parser userdata")
		return 0
	}

	var fragments []string
	collect := func(_ int, match *goquery.Selection) {
		inner, renderErr := match.Html()
		if renderErr != nil {
			logger.Error(renderErr)
			return
		}
		fragments = append(fragments, inner)
	}
	document.Find(query).Each(collect)

	joined := strings.Join(fragments, "\n")
	L.Push(lua.LString(joined))
	return 1
}
// remove_html_node implements the Lua method `remove`.
//
// Lua arguments: (self userdata, selector string).
// Lua returns: nothing. Every element matching selector is removed from the
// document in place. If self does not carry a *goquery.Document the problem
// is logged and the call is a no-op.
func remove_html_node(L *lua.LState) int {
	userData := L.CheckUserData(1)
	query := L.CheckString(2)

	document, isDocument := userData.Value.(*goquery.Document)
	if !isDocument {
		logger.Error("Expected html_parser userdata")
		return 0
	}

	// Remove detaches every node of the matched set from the document.
	document.Find(query).Remove()
	return 0
}
// get_html_node_attribute implements the Lua method `get_attribute`.
//
// Lua arguments: (self userdata, node selector string, attribute string).
// Lua returns: one array-style table with the attribute's value for every
// matching element that has it, in match order. Matching elements without
// the attribute are logged and skipped. Returns nothing if self does not
// carry a *goquery.Document.
//
// Example: with selector "img" and attribute "src" the table holds the src
// of each <img> in the document.
func get_html_node_attribute(L *lua.LState) int {
	userData := L.CheckUserData(1)
	node := L.CheckString(2)
	attribute := L.CheckString(3)

	document, isDocument := userData.Value.(*goquery.Document)
	if !isDocument {
		logger.Error("Expected html_parser userdata")
		return 0
	}

	// Fill the Lua table directly while walking the matches.
	values := L.NewTable()
	document.Find(node).Each(func(_ int, match *goquery.Selection) {
		value, present := match.Attr(attribute)
		if !present {
			logger.Error("Could not find attribute: ", attribute, " in node: ", node)
			return
		}
		values.Append(lua.LString(value))
	})

	L.Push(values)
	return 1
}
// rewrite_html_nodes implements the Lua method `rewrite_nodes`, e.g.
//
//	article:rewrite_nodes("img", "src", {"http://example.com/image.jpg", "http://example.com/image2.jpg"})
//
// Lua arguments: (self userdata, node selector string, attribute string,
// values table). The n-th element matching the selector gets attribute set
// to the n-th entry of values, so <img src="local/image.jpg"> can become
// <img src="http://example.com/image.jpg">.
//
// Lua returns: nothing.
//
// If self does not carry a *goquery.Document the problem is logged and the
// call is a no-op. If there are more matching elements than values, the
// surplus elements are left unchanged and the shortfall is logged.
func rewrite_html_nodes(L *lua.LState) int {
	ud := L.CheckUserData(1)
	node := L.CheckString(2)
	attribute := L.CheckString(3)
	rewriteArr := L.CheckTable(4)

	doc, ok := ud.Value.(*goquery.Document)
	if !ok {
		logger.Error("Expected html_parser userdata")
		// Without a document there is nothing to rewrite; continuing would
		// dereference a nil *goquery.Document.
		return 0
	}

	count := rewriteArr.Len()
	rewriteStrings := make([]string, 0, count)
	for i := 1; i <= count; i++ {
		rewriteStrings = append(rewriteStrings, rewriteArr.RawGetInt(i).String())
	}

	skipped := 0
	doc.Find(node).Each(func(idx int, s *goquery.Selection) {
		// Guard against more matches than replacement values; indexing past
		// the end of rewriteStrings would panic.
		if idx >= len(rewriteStrings) {
			skipped++
			return
		}
		rewrite := rewriteStrings[idx]
		s.SetAttr(attribute, rewrite)
		logger.Debug("Rewrote: ", attribute, " to: ", rewrite)
	})
	if skipped > 0 {
		logger.Error("Not enough rewrite values for node: ", node, ", nodes left unchanged: ", skipped)
	}
	return 0
}
// get_document implements the Lua method `get`.
//
// Lua arguments: (self userdata).
// Lua returns: one string with the document rendered as HTML. Returns
// nothing if self does not carry a *goquery.Document or if rendering fails;
// both cases are logged.
func get_document(L *lua.LState) int {
	userData := L.CheckUserData(1)

	document, isDocument := userData.Value.(*goquery.Document)
	if !isDocument {
		logger.Error("Expected html_parser userdata")
		return 0
	}

	rendered, renderErr := document.Html()
	if renderErr != nil {
		logger.Error(renderErr)
		return 0
	}

	L.Push(lua.LString(rendered))
	return 1
}