Initial commit

This commit is contained in:
Kienan Stewart 2022-09-23 19:56:04 -04:00
commit a6784fdce0
6 changed files with 1891 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/target

1756
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

10
Cargo.toml Normal file
View File

@ -0,0 +1,10 @@
[package]
name = "haunter"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
# Blocking (synchronous) selenium WebDriver client; main.rs drives a remote
# chrome session with it to fetch page source.
thirtyfour_sync = "0.27"
# HTML parsing + CSS-selector extraction of the watched page fragments.
scraper = "0.13"

3
README.md Normal file
View File

@ -0,0 +1,3 @@
# Haunter
Watches websites for changes.

13
container-compose.yml Normal file
View File

@ -0,0 +1,13 @@
---
version: '3'
services:
  # Standalone selenium + chrome; src/main.rs connects to it at
  # http://localhost:4444.
  driver:
    image: docker.io/selenium/standalone-chrome
    ports:
      - '4444:4444'
  # NOTE(review): `build_context` is not a valid compose key — if this service
  # is re-enabled it should be `build:` with a nested `context: ./`.
  # watcher:
  #   build_context:
  #     - ./
  #   depends_on:
  #     - driver

108
src/main.rs Normal file
View File

@ -0,0 +1,108 @@
use std::option;
use std::thread;
use std::time::Duration;
use std::time::Instant;
use thirtyfour_sync::WebDriverCommands;
/// A single site-watching task: what to fetch, what to extract, how often.
struct Job<'a> {
    /// Page to fetch through the WebDriver session.
    url: &'a str,
    /// CSS selector; the concatenated inner HTML of its matches is the
    /// watched value.
    selector: &'a str,
    /// Minimum interval between successive runs of this job.
    every: Duration,
    /// When the job was last started; `None` until its first run.
    last_run: Option<Instant>,
}
/// A [`Job`] plus the bookkeeping needed to run it on a background thread.
struct ThreadJob<'a> {
    /// The watch task itself.
    job: Job<'a>,
    /// Handle of the in-flight fetch, if one has been spawned; the thread
    /// resolves to the page source or a static error message.
    handle: Option<
        std::thread::JoinHandle<
            Result<String, &'static str>
        >
    >,
    /// Extracted value from the previous completed run; compared against the
    /// new value to detect changes.
    last_result: Option<String>,
}
/// Poll a fixed set of jobs forever: spawn a worker thread per due job,
/// harvest finished workers, extract the selected HTML, and report changes.
fn main() {
    // Address of the selenium standalone-chrome service
    // (container-compose.yml publishes it on this port).
    let driver = "http://localhost:4444";

    let mut jobs = Vec::new();
    let some_job = Job {
        url: "https://www.rust-lang.org",
        selector: "a.download-link",
        every: Duration::new(60, 0),
        last_run: None,
    };
    let other_job = Job {
        url: "https://arstechnica.com/",
        selector: "li.split-feature:nth-child(1) > header:nth-child(4) > h2:nth-child(1) > a:nth-child(1)",
        every: Duration::new(120, 0),
        last_run: None,
    };
    jobs.push(ThreadJob {
        job: some_job,
        handle: None,
        last_result: None,
    });
    jobs.push(ThreadJob {
        job: other_job,
        handle: None,
        last_result: None,
    });

    // @BUG: It seems the selenium chrome driver can't handle concurrent
    // sessions from multiple threads. When the threads attempt to run
    // concurrently, there are crashes, eg.
    //
    // thread '<unnamed>' panicked at 'failed to get url: UnknownError(WebDriverErrorInfo { status: 500, error: "", value: WebDriverErrorValue { message: "unknown error: session deleted because of page crash\nfrom tab crashed\n (Session info: chrome=105.0.5195.52)
    //
    // This should just run single jobs consecutively as a result.
    loop {
        for tj in jobs.iter_mut() {
            // A job is due on its first pass, or once `every` has elapsed
            // since it was last started.
            let due = match tj.job.last_run {
                None => true,
                Some(started) => started.elapsed() >= tj.job.every,
            };

            if tj.handle.is_none() && due {
                // Copy the &'static str out of `tj` so the closure can be
                // `move` and satisfy thread::spawn's 'static bound; the
                // previous non-`move` closure borrowed `tj` and could not
                // compile alongside the `tj.handle = ...` assignment.
                let url = tj.job.url;
                tj.handle = Some(thread::spawn(move || get_source(driver, url)));
                println!("Started thread for '{}'", url);
                tj.job.last_run = Some(Instant::now());
                continue;
            }

            // Harvest a finished worker, if any.
            if tj.handle.as_ref().map_or(false, |h| h.is_finished()) {
                let duration = tj
                    .job
                    .last_run
                    .expect("last_run is set whenever a handle exists")
                    .elapsed();
                tj.job.last_run = Some(Instant::now());

                // join() errs only if the worker panicked; the inner Result
                // is get_source's own failure reporting. Handle both instead
                // of unwrap()ing the inner value.
                let page = match tj.handle.take().expect("checked above").join() {
                    Ok(Ok(source)) => source,
                    Ok(Err(e)) => {
                        println!("Error for job '{}': {:?}", tj.job.url, e);
                        continue;
                    }
                    Err(panic) => {
                        println!("Error for job '{}': {:?}", tj.job.url, panic);
                        continue;
                    }
                };

                let fragment = scraper::Html::parse_document(page.as_str());
                let selector = scraper::Selector::parse(tj.job.selector)
                    .expect("Failed to parse selector");
                // The watched value: concatenated inner HTML of every match.
                let result: String = fragment
                    .select(&selector)
                    .map(|element| element.inner_html())
                    .collect();

                println!("Job for '{}' took about {}s", tj.job.url, duration.as_secs());

                let changed = match tj.last_result.as_deref() {
                    None => {
                        println!("New result: '{}'\n", result);
                        true
                    }
                    Some(previous) if previous != result.as_str() => {
                        println!("Change detected\nOld value: '{}'\nNew value: '{}'",
                            previous, result);
                        true
                    }
                    Some(_) => false,
                };
                if changed {
                    tj.last_result = Some(result);
                }
            }
        }
        // Cheap poll; the jobs themselves are throttled via `every`.
        std::thread::sleep(Duration::new(1, 0));
    }
}
/// Fetch `url` through the selenium server at `driver` and return the raw
/// page source.
///
/// # Errors
///
/// Returns a static message naming the WebDriver step that failed. The
/// previous version `expect()`ed every step, which (a) made the declared
/// error type unreachable, (b) panicked the worker thread on ordinary
/// network/driver failures, and (c) skipped `quit()`, leaking the selenium
/// session. Here the session is closed on every path once it was opened.
fn get_source(driver: &str, url: &str) -> Result<String, &'static str> {
    let caps = thirtyfour_sync::DesiredCapabilities::chrome();
    // The only step with nothing to clean up on failure.
    let session = thirtyfour_sync::WebDriver::new(driver, &caps)
        .map_err(|_| "failed to get driver")?;

    // Capture the navigation/source outcome but do not bail yet: the session
    // must be closed on every path or selenium leaks it.
    let source = session
        .get(url)
        .map_err(|_| "failed to get url")
        .and_then(|_| {
            session
                .page_source()
                .map_err(|_| "failed to get page source")
        });

    session.quit().map_err(|_| "failed to close session")?;
    source
}