Add working implementation of multiple RSS feeds

- Add phoronix, 9to5mac and techcrunch scripts
- Add clap for basic argument parsing for disabling the scheduler
- Add clap argument for entering a cron string (the default value is
  wrong right now)
- Add `check_if_articles_in_feed_exist` that will return articles from
  the database that already exist
- Print out lua errors rather than just printing a generic error
This commit is contained in:
Christopher Williams 2024-09-26 19:33:01 -04:00
parent 62b2909668
commit 05ce7537b2
10 changed files with 348 additions and 37 deletions

128
Cargo.lock generated
View File

@ -290,6 +290,46 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e4de3bc4ea267985becf712dc6d9eed8b04c953b3fcfb339ebc87acd9804901"
[[package]]
name = "clap"
version = "4.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b0956a43b323ac1afaffc053ed5c4b7c1f1800bacd1683c353aabbb752515dd3"
dependencies = [
"clap_builder",
"clap_derive",
]
[[package]]
name = "clap_builder"
version = "4.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d72166dd41634086d5803a47eb71ae740e61d84709c36f3c34110173db3961b"
dependencies = [
"anstream",
"anstyle",
"clap_lex",
"strsim",
]
[[package]]
name = "clap_derive"
version = "4.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn 2.0.77",
]
[[package]]
name = "clap_lex"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
[[package]]
name = "cloudabi"
version = "0.0.3"
@ -368,6 +408,15 @@ dependencies = [
"once_cell",
]
[[package]]
name = "cron-lingo"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a82ada06b3cc8c1a50fdf20e56953c24f01ef86e92dde7f080d7174f9c70a252"
dependencies = [
"time 0.3.36",
]
[[package]]
name = "crossbeam-deque"
version = "0.7.4"
@ -455,6 +504,15 @@ version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8566979429cf69b49a5c740c60791108e86440e8be149bbea4fe54d2c32d6e2"
[[package]]
name = "deranged"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
dependencies = [
"powerfmt",
]
[[package]]
name = "derive_more"
version = "0.99.18"
@ -840,6 +898,12 @@ dependencies = [
"http 0.2.12",
]
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hermit-abi"
version = "0.3.9"
@ -989,7 +1053,7 @@ dependencies = [
"log",
"net2",
"rustc_version",
"time",
"time 0.1.45",
"tokio 0.1.22",
"tokio-buf",
"tokio-executor",
@ -1539,6 +1603,12 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-conv"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
[[package]]
name = "num-integer"
version = "0.1.46"
@ -1567,6 +1637,15 @@ dependencies = [
"libc",
]
[[package]]
name = "num_threads"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9"
dependencies = [
"libc",
]
[[package]]
name = "object"
version = "0.36.4"
@ -1809,6 +1888,12 @@ version = "0.3.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec"
[[package]]
name = "powerfmt"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
[[package]]
name = "ppv-lite86"
version = "0.2.20"
@ -2060,7 +2145,10 @@ version = "0.1.0"
dependencies = [
"anyhow",
"chrono",
"clap",
"clap_derive",
"cron",
"cron-lingo",
"env_logger",
"html5ever 0.29.0",
"hyper 1.4.1",
@ -2476,6 +2564,12 @@ dependencies = [
"quote",
]
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "subtle"
version = "2.6.1"
@ -2597,6 +2691,38 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "time"
version = "0.3.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
dependencies = [
"deranged",
"libc",
"num-conv",
"num_threads",
"powerfmt",
"serde",
"time-core",
"time-macros",
]
[[package]]
name = "time-core"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
[[package]]
name = "time-macros"
version = "0.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
dependencies = [
"num-conv",
"time-core",
]
[[package]]
name = "tiny_http"
version = "0.12.0"

View File

@ -8,7 +8,10 @@ edition = "2021"
[dependencies]
anyhow = "1.0.87"
chrono = "0.4.38"
clap = { version = "4.5.18", features = ["derive"] }
clap_derive = "4.5.18"
cron = "0.12.1"
cron-lingo = "0.4.2"
env_logger = "0.11.5"
html5ever = "0.29.0"
hyper = "1.4.1"

28
scripts/.example.lua Normal file
View File

@ -0,0 +1,28 @@
local WEBSITE_NAME = "TechCrunch"
local WEBSITE_HOME = "https://techcrunch.com"

-- Register the route: the first argument names the global table below,
-- the second is the URL path the route is served under.
add_route("techCrunch", "/TechCrunch")

techCrunch = {}

-- Example route handler.
-- Downloads an RSS feed, replaces the first article's description with the
-- full article body scraped from the article page, and returns two values:
-- the rendered feed XML and the feed object itself.
function techCrunch.route(args)
    local xml = get("http://localhost:8081/feed.xml") -- Get an xml RSS feed
    local rss_parser = Feed() -- Create a new instance of the Feed object
    local feed = rss_parser:new(xml) -- Parse the xml into a feed object
    sleep(1000)

    local articles = feed.channel.articles -- Get all of the article objects
    local article = articles[1] -- Get the first article object
    if article == nil then
        -- Empty feed: nothing to enrich, so return the feed unchanged
        -- instead of raising an error on a nil article.
        return feed:render(), feed
    end

    log:info("Title: " .. article.title)
    log:info("Description: " .. article.description)

    local article_content = get(article.link) -- Get the entire article content
    local html_parser = HtmlParser() -- Create a new instance of the html parser
    html_parser:parse(article_content) -- Parse the article into an html tree

    -- Select the element with the class 'wp-block-post-content'
    local elements = html_parser:select_element('.wp-block-post-content')
    local element = elements[1] -- String of the html from the element selected
    if element ~= nil then
        -- Replace the description with the entire article. When the selector
        -- matches nothing, keep the summary that came with the feed.
        article.description = element
    end

    return feed:render(), feed
end

View File

@ -3,9 +3,46 @@ local WEBSITE_HOME = "https://9to5mac.com"
add_route("Nineto5mac", "/9to5mac")
Nineto5mac = {}
function Nineto5mac.rout(args)
print("Executing the Nineto5mac.route")
return "NineTo5Mac XML"
function Nineto5mac.route(args)
local xml = get("https://9to5mac.com/feed/")
local rss_parser = Feed()
local feed = rss_parser:new(xml)
local articles = feed.channel.articles
local db = Database()
local existing_articles = db:check_if_articles_in_feed_exist(feed)
log:debug("Fetching missing articles from the database")
for _, article in ipairs(articles) do
if existing_articles[article.guid.value] then
log:debug("Article already exists in the database: " .. article.title)
article.description = existing_articles[article.guid.value]
goto continue
end
log:debug("Getting article: " .. article.title .. " from " .. article.link)
local article_content = get(article.link)
local html_parser = HtmlParser()
html_parser:parse(article_content)
-- Remove sections
html_parser:delete_element("header")
html_parser:delete_element("script")
html_parser:delete_element(".ad-disclaimer-container")
html_parser:delete_element("#after_disclaimer_placement")
html_parser:delete_element(".adsbygoogle")
html_parser:delete_element(".google-news-link")
local elements = html_parser:select_element('.post-content')
local element = elements
[1]
article.description =
element
sleep(500)
::continue::
end
return feed:render(), feed
end

43
scripts/phoronix.lua Normal file
View File

@ -0,0 +1,43 @@
-- TODO: Once the feed is sufficiently long, we could make a bigger feed since the feed is only 20 articles long
local WEBSITE_NAME = "Phoronix"
local WEBSITE_HOME = "https://phoronix.com"

-- Register the route: the first argument names the global table below,
-- the second is the URL path the route is served under.
add_route("phoronix", "/phoronix")

phoronix = {}

-- Route handler for the Phoronix feed.
-- Downloads the RSS feed and swaps every article's summary for the full
-- article body. Bodies already stored in the database are reused; the rest
-- are scraped from the article page. Returns two values: the rendered feed
-- XML and the feed object itself.
function phoronix.route(args)
    local xml = get("https://www.phoronix.com/rss.php") -- Get the xml from the website
    local rss_parser = Feed() -- Create a new instance of the Feed object
    local feed = rss_parser:new(xml) -- Parse the xml into a feed object
    local articles = feed.channel.articles -- Get all of the article objects

    -- Table of guid -> stored description for articles already in the database
    local db = Database()
    local existing_articles = db:check_if_articles_in_feed_exist(feed)

    log:debug("Fetching missing articles from the database")
    for _, article in ipairs(articles) do
        -- An article without a guid can never be found in the table, so it
        -- is treated as "not stored yet" rather than raising on a nil index.
        local guid = article.guid and article.guid.value
        local stored = guid and existing_articles[guid]

        if stored then
            log:debug("Article already exists in the database: " .. article.title)
            article.description = stored
        else
            log:debug("Getting article: " .. article.title .. " from " .. article.link)
            local article_content = get(article.link) -- Get the entire article content
            local html_parser = HtmlParser() -- Create a new instance of the html parser
            html_parser:parse(article_content) -- Parse the article into an html tree

            -- Select the element with the class 'content'
            local elements = html_parser:select_element('.content')
            local element = elements[1] -- String of the html from the element selected
            if element ~= nil then
                -- Replace the description with the entire article. When the
                -- selector matches nothing, keep the summary from the feed.
                article.description = element
            end

            -- Pause between downloads so the site is not hit back-to-back
            sleep(500)
        end
    end

    return feed:render(), feed
end

View File

@ -1,3 +1,4 @@
-- TODO: Once the feed is sufficiently long, we could make a bigger feed since the feed is only 20 articles long
local WEBSITE_NAME = "TechCrunch"
local WEBSITE_HOME = "https://techcrunch.com"
@ -5,22 +6,38 @@ add_route("techCrunch", "/TechCrunch")
techCrunch = {}
function techCrunch.route(args)
local xml = get("http://localhost:8081/feed.xml") -- Get an xml RSS feed
local rss_parser = Feed() -- Create a new instance of the Feed object
local feed = rss_parser:new(xml) -- Parse the xml into a feed object
local xml = get("https://techcrunch.com/feed/") -- Get the xml from the website
local rss_parser = Feed() -- Create a new instance of the Feed object
local feed = rss_parser:new(xml) -- Parse the xml into a feed object
local articles = feed.channel.articles -- Get all of the article objects
local article = articles[1] -- Get the first article object
log:info("Title: " .. article.title)
log:info("Description: " .. article.description)
local articles = feed.channel.articles -- Get all of the article objects
local article_content = get(article.link) -- Get the entire article content
local html_parser = HtmlParser() -- Create a new instance of the html parser
html_parser:parse(article_content) -- Parse the article into an html tree
-- TODO: Add api to check if the articles are already in the database
local db = Database()
local existing_articles = db:check_if_articles_in_feed_exist(feed) -- Get the missing articles from the database
local elements = html_parser:select_element('.wp-block-post-content') -- Select the element with the class 'wp-block-post-content'
local element = elements[1] -- String of the html from the element selected
article.description = element -- Replace the description with the entire article
log:debug("Fetching missing articles from the database")
for _, article in ipairs(articles) do
if existing_articles[article.guid.value] then
log:debug("Article already exists in the database: " .. article.title)
article.description = existing_articles[article.guid.value]
goto continue
end
return feed:render()
log:debug("Getting article: " .. article.title .. " from " .. article.link)
local article_content = get(article.link) -- Get the entire article content
local html_parser = HtmlParser() -- Create a new instance of the html parser
html_parser:parse(article_content) -- Parse the article into an html tree
local elements = html_parser:select_element('.wp-block-post-content') -- Select the element with the class 'wp-block-post-content'
local element = elements
[1] -- String of the html from the element selected
article.description =
element -- Replace the description with the entire article
sleep(500)
::continue::
end
return feed:render(), feed
end

View File

@ -1,9 +1,9 @@
use std::rc::Rc;
use std::{collections::HashMap, rc::Rc};
use log::{debug, error, trace};
use mlua::{Lua, Value, FromLua, UserData, UserDataMethods};
use mlua::{FromLua, Lua, Table, UserData, UserDataMethods, Value};
use rusqlite::{params, Connection};
use crate::{implement_from_lua, rss_parser::Item as Article};
use crate::{implement_from_lua, rss_parser::{Item as Article, Rss}};
#[derive(Debug, Clone)]
pub struct Database {
@ -132,9 +132,12 @@ impl Database {
);
match ret {
Ok(_) => Ok(ret?),
Err(e) => {
error!("Error inserting {} into database: {}", title, e);
Err(e.into())
Err(_) => {
// TODO: Do we care about entries that already exist?
//error!("Error inserting {} into database: {}", title, e);
//Err(e.into())
Ok(0)
}
}
}
@ -221,6 +224,31 @@ impl UserData for Database {
}
});
// Lua: `db:check_if_articles_in_feed_exist(feed)`
//
// Looks up every article of `feed` in the database by its guid and returns
// a Lua table mapping `guid -> stored description` for the ones found.
// Articles that are not stored, have no guid, or whose stored row has no
// description are left out of the table, so a script treats them as missing.
// A failed lookup is logged and that article is skipped; only Lua table
// errors are propagated to the caller.
methods.add_method("check_if_articles_in_feed_exist", |lua, this, feed: Rss| -> Result<Table, mlua::Error> {
    let table = lua.create_table()?;

    let articles = &feed.channel.borrow().items;
    for article in articles {
        let article = article.borrow();

        // A missing guid (or one without a value) must not panic inside
        // the Lua callback; such an article simply cannot be looked up.
        let Some(guid) = article.guid.as_ref().and_then(|g| g.value.clone()) else {
            continue;
        };

        match this.get_article_by_guid(&guid) {
            Ok(Some(stored)) => {
                // Only report rows that actually carry a description.
                if let Some(description) = stored.description.clone() {
                    table.set(guid, description)?;
                }
            }
            Ok(None) => {}
            Err(e) => {
                error!("Error checking if article exists: {}", e);
            }
        }
    }

    Ok(table)
});
//methods.add_method("does_article_exist", |_, this, guid: String| -> Result<Result<bool, _>, mlua::Error> {
// Ok(this.does_article_exist(&guid))
//});

View File

@ -3,6 +3,7 @@ use std::collections::HashMap;
use std::sync::{Arc, Mutex, RwLock};
use std::thread;
use once_cell::sync::Lazy;
use clap::Parser;
mod html_parser;
mod router;
@ -11,6 +12,19 @@ mod scheduler;
mod scripting;
mod database;
// Command-line options, parsed with clap's derive API.
// (A plain comment is used here so the `about` text shown by `--help` is
// not replaced by a struct-level doc comment.)
#[derive(Parser)]
#[command(version, about, long_about = None)]
struct Cli {
    /// Port passed to the router
    #[arg(short, long, default_value = "8000")]
    port: u16,

    /// Do not start the scheduler
    #[arg(short, long)]
    no_scheduler: bool,

    /// Cron expression for the scheduler, in the `cron` crate's format with
    /// a leading seconds field (the default fires at second 0 of every minute)
    // The previous default "1m" is not a cron expression; it is handed to
    // `cron::Schedule::from_str(..).unwrap()`, so it panicked unless `--cron`
    // was given.
    #[arg(short, long, default_value = "0 * * * * *")]
    cron: String,
}
static REDIS: Lazy<Mutex<redis::Connection>> = Lazy::new(|| {
let client = redis::Client::open("redis://127.0.0.1").unwrap();
let conn = client.get_connection().unwrap();
@ -19,6 +33,11 @@ static REDIS: Lazy<Mutex<redis::Connection>> = Lazy::new(|| {
#[tokio::main]
async fn main() {
let cli = Cli::parse();
let port = cli.port;
let cron = cli.cron;
let no_scheduler = cli.no_scheduler;
env_logger::init();
let routes = Arc::new(RwLock::new(HashMap::new()));
@ -40,15 +59,17 @@ async fn main() {
.unwrap()
.insert("main".to_string(), "/".to_string());
{
// NOTE: Set the routes as ready
// This is in a block so the lock is released after setting the value
let mut r = is_ready_ref.lock().unwrap();
*r = true;
let mut r = is_ready_ref.lock().unwrap();
*r = true;
std::mem::drop(r); // Unlock the mutex
if no_scheduler {
return;
}
let mut scheduler = scheduler::Scheduler::new(engine).unwrap(); //TODO: Handle error
scheduler.schedule("0/5 * * * * *").unwrap();
//scheduler.schedule("* 1 * * * *").unwrap();
scheduler.schedule(&cron).unwrap();
});
let is_ready = is_ready.clone();
@ -58,7 +79,7 @@ async fn main() {
debug!("Routes are ready");
let routes = routes.read().unwrap().clone();
let router = router::Router::new(routes.clone(), 8000);
let router = router::Router::new(routes.clone(), port);
info!("Successfully created router and scripting engine");
router.serve().await;

View File

@ -1,7 +1,6 @@
use anyhow::Result;
use chrono::prelude::*;
use chrono::Utc;
use cron::Schedule;
use log::debug;
use log::error;
use log::trace;
@ -75,7 +74,7 @@ impl Scheduler {
/// Example of a cron expression for every minute (the `cron` crate expects a leading seconds field): `0 * * * * *`
pub fn schedule(&mut self, cron_expr: &str) -> Result<()> {
let schedule = Schedule::from_str(cron_expr).unwrap();
let schedule = cron::Schedule::from_str(cron_expr).unwrap();
while let Some(next) = schedule.upcoming(Utc).next() {
debug!("Scheduled task has started running at {}", Local::now());
self.execute_all_routes()?;

View File

@ -52,7 +52,15 @@ impl ScriptingEngine {
}
pub fn load_script(&self, script: &str) -> Result<()> {
self.lua.load(script).exec()
match self.lua.load(script).exec() {
Ok(_) => Ok(()),
Err(err) => {
//TODO: Should this be an error and stop loading everything?
error!("Error loading script: {}", err);
error!("Not loading script: {}", script);
Ok(())
}
}
}
pub fn load_script_with_return(&self, script: &str) -> Result<String> {
@ -161,9 +169,10 @@ impl ScriptingEngine {
};
let result = match route_func.call::<_, (String, Rss)>(()) {
Ok(result) => result,
Err(_) => {
Err(e) => {
// Missing the right return values drops down here
error!("Error executing route function for route {}", obj_name);
error!("Lua error: {}", e);
return Err(mlua::Error::external("Error executing route function"));
}
};