Initial commit
This commit is contained in:
commit
a6784fdce0
|
@ -0,0 +1 @@
|
||||||
|
/target
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,10 @@
|
||||||
|
[package]
|
||||||
|
name = "haunter"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
thirtyfour_sync = "0.27"
|
||||||
|
scraper = "0.13"
|
|
@ -0,0 +1,13 @@
|
||||||
|
---
version: '3'

services:
  driver:
    image: docker.io/selenium/standalone-chrome
    # Chrome keeps renderer state in /dev/shm; Docker's small default there is a
    # well-known cause of "session deleted because of page crash" errors.
    # The docker-selenium documentation recommends 2g for browser containers.
    shm_size: 2gb
    ports:
      - '4444:4444'
#  watcher:
#    build_context:
#      - ./
#    depends_on:
#      - driver
|
|
@ -0,0 +1,108 @@
|
||||||
|
use std::option;
|
||||||
|
use std::thread;
|
||||||
|
use std::time::Duration;
|
||||||
|
use std::time::Instant;
|
||||||
|
use thirtyfour_sync::WebDriverCommands;
|
||||||
|
|
||||||
|
/// Describes one page to watch: where to fetch it, which part of it to compare,
/// and how often to check.
#[derive(Debug, Clone)]
struct Job<'a> {
    /// Address of the page that is loaded through the WebDriver session.
    url: &'a str,
    /// CSS selector picking the elements whose inner HTML is compared between runs.
    selector: &'a str,
    /// Minimum time that must pass after `last_run` before the job is started again.
    every: Duration,
    /// When the job was last started or last finished; `None` until the first run.
    last_run: Option<Instant>,
}
|
||||||
|
|
||||||
|
struct ThreadJob<'a> {
|
||||||
|
job: Job<'a>,
|
||||||
|
handle: Option<
|
||||||
|
std::thread::JoinHandle<
|
||||||
|
Result<String, &'static str>
|
||||||
|
>
|
||||||
|
>,
|
||||||
|
last_result: Option<String>,
|
||||||
|
}
|
||||||
|
fn main() {
|
||||||
|
let driver = "http://localhost:4444";
|
||||||
|
let mut jobs = Vec::new();
|
||||||
|
let some_job = Job {
|
||||||
|
url: "https://www.rust-lang.org",
|
||||||
|
selector: "a.download-link",
|
||||||
|
every: Duration::new(60, 0),
|
||||||
|
last_run: None,
|
||||||
|
};
|
||||||
|
let other_job = Job {
|
||||||
|
url: "https://arstechnica.com/",
|
||||||
|
selector: "li.split-feature:nth-child(1) > header:nth-child(4) > h2:nth-child(1) > a:nth-child(1)",
|
||||||
|
every: Duration::new(120, 0),
|
||||||
|
last_run: None,
|
||||||
|
};
|
||||||
|
jobs.push(ThreadJob {
|
||||||
|
job: some_job,
|
||||||
|
handle: None,
|
||||||
|
last_result: None,
|
||||||
|
});
|
||||||
|
jobs.push(ThreadJob {
|
||||||
|
job: other_job,
|
||||||
|
handle: None,
|
||||||
|
last_result: None,
|
||||||
|
});
|
||||||
|
|
||||||
|
// @BUG: It seems the selenium chrome driver can't handle concurrent sessions from
|
||||||
|
// multiple threads. When the threads attempt to run concurrently, there are crashes,
|
||||||
|
// eg.
|
||||||
|
//
|
||||||
|
// thread '<unnamed>' panicked at 'failed to get url: UnknownError(WebDriverErrorInfo { status: 500, error: "", value: WebDriverErrorValue { message: "unknown error: session deleted because of page crash\nfrom tab crashed\n (Session info: chrome=105.0.5195.52)
|
||||||
|
//
|
||||||
|
// This should just run single jobs consecutively as a result.
|
||||||
|
loop {
|
||||||
|
for tj in jobs.iter_mut() {
|
||||||
|
let should_run_by_time = tj.job.last_run.is_some() && Instant::now().duration_since(tj.job.last_run.unwrap()).ge(&tj.job.every);
|
||||||
|
if tj.handle.is_none() && (should_run_by_time || tj.job.last_run.is_none()) {
|
||||||
|
tj.handle = Some(thread::spawn(|| {
|
||||||
|
return get_source(driver, tj.job.url);
|
||||||
|
}));
|
||||||
|
println!("Started thread for '{}'", tj.job.url);
|
||||||
|
tj.job.last_run = Some(Instant::now());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if tj.handle.is_some() && tj.handle.as_ref().unwrap().is_finished() {
|
||||||
|
let duration = Instant::now().duration_since(tj.job.last_run.unwrap());
|
||||||
|
tj.job.last_run = Some(Instant::now());
|
||||||
|
let source = tj.handle.take().unwrap().join();
|
||||||
|
if source.is_err() {
|
||||||
|
println!("Error for job '{}': {:?}", tj.job.url, source);
|
||||||
|
tj.handle = None;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let fragment = scraper::Html::parse_document(source.unwrap().unwrap().as_str());
|
||||||
|
let selector = scraper::Selector::parse(tj.job.selector).expect("Failed to parse selector");
|
||||||
|
let mut result = String::from("");
|
||||||
|
for element in fragment.select(&selector) {
|
||||||
|
result.push_str(element.inner_html().as_str());
|
||||||
|
}
|
||||||
|
println!("Job for '{}' took about {}s", tj.job.url, duration.as_secs());
|
||||||
|
if tj.last_result.is_none() {
|
||||||
|
println!("New result: '{}'\n", result);
|
||||||
|
tj.last_result = Some(result);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if tj.last_result.as_ref().unwrap().ne(&result) {
|
||||||
|
println!("Change detected\nOld value: '{}'\nNew value: '{}'",
|
||||||
|
tj.last_result.as_ref().unwrap(), result);
|
||||||
|
tj.last_result = Some(result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tj.handle = None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::thread::sleep(Duration::new(1, 0));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_source(driver: &str, url: &str) -> Result<String, &'static str> {
|
||||||
|
let caps = thirtyfour_sync::DesiredCapabilities::chrome();
|
||||||
|
let driver = thirtyfour_sync::WebDriver::new(driver, &caps).expect("failed to get driver");
|
||||||
|
driver.get(url).expect("failed to get url");
|
||||||
|
let source = driver.page_source().expect("failed to get page source");
|
||||||
|
driver.quit().expect("failed to close session");
|
||||||
|
return Ok(source);
|
||||||
|
}
|
Loading…
Reference in New Issue