Initial commit
This commit is contained in:
commit
a6784fdce0
|
@ -0,0 +1 @@
|
|||
/target
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,10 @@
|
|||
[package]
|
||||
name = "haunter"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
thirtyfour_sync = "0.27"
|
||||
scraper = "0.13"
|
|
@ -0,0 +1,13 @@
|
|||
---
|
||||
version: '3'
|
||||
|
||||
services:
|
||||
driver:
|
||||
image: docker.io/selenium/standalone-chrome
|
||||
ports:
|
||||
- '4444:4444'
|
||||
# watcher:
|
||||
# build_context:
|
||||
# - ./
|
||||
# depends_on:
|
||||
# - driver
|
|
@ -0,0 +1,108 @@
|
|||
use std::option;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use thirtyfour_sync::WebDriverCommands;
|
||||
|
||||
/// A single page-watching job: which page to load, what to extract from it,
/// and how often to do so.
#[derive(Debug, Clone)]
struct Job<'a> {
    /// Address of the page to load.
    url: &'a str,
    /// CSS selector identifying the elements whose inner HTML is compared
    /// between runs.
    selector: &'a str,
    /// Minimum time that must pass since `last_run` before the job is started again.
    every: Duration,
    /// When the job was last started or finished; `None` until the first run.
    last_run: Option<Instant>,
}
|
||||
|
||||
struct ThreadJob<'a> {
|
||||
job: Job<'a>,
|
||||
handle: Option<
|
||||
std::thread::JoinHandle<
|
||||
Result<String, &'static str>
|
||||
>
|
||||
>,
|
||||
last_result: Option<String>,
|
||||
}
|
||||
fn main() {
|
||||
let driver = "http://localhost:4444";
|
||||
let mut jobs = Vec::new();
|
||||
let some_job = Job {
|
||||
url: "https://www.rust-lang.org",
|
||||
selector: "a.download-link",
|
||||
every: Duration::new(60, 0),
|
||||
last_run: None,
|
||||
};
|
||||
let other_job = Job {
|
||||
url: "https://arstechnica.com/",
|
||||
selector: "li.split-feature:nth-child(1) > header:nth-child(4) > h2:nth-child(1) > a:nth-child(1)",
|
||||
every: Duration::new(120, 0),
|
||||
last_run: None,
|
||||
};
|
||||
jobs.push(ThreadJob {
|
||||
job: some_job,
|
||||
handle: None,
|
||||
last_result: None,
|
||||
});
|
||||
jobs.push(ThreadJob {
|
||||
job: other_job,
|
||||
handle: None,
|
||||
last_result: None,
|
||||
});
|
||||
|
||||
// @BUG: It seems the selenium chrome driver can't handle concurrent sessions from
|
||||
// multiple threads. When the threads attempt to run concurrently, there are crashes,
|
||||
// eg.
|
||||
//
|
||||
// thread '<unnamed>' panicked at 'failed to get url: UnknownError(WebDriverErrorInfo { status: 500, error: "", value: WebDriverErrorValue { message: "unknown error: session deleted because of page crash\nfrom tab crashed\n (Session info: chrome=105.0.5195.52)
|
||||
//
|
||||
// This should just run single jobs consecutively as a result.
|
||||
loop {
|
||||
for tj in jobs.iter_mut() {
|
||||
let should_run_by_time = tj.job.last_run.is_some() && Instant::now().duration_since(tj.job.last_run.unwrap()).ge(&tj.job.every);
|
||||
if tj.handle.is_none() && (should_run_by_time || tj.job.last_run.is_none()) {
|
||||
tj.handle = Some(thread::spawn(|| {
|
||||
return get_source(driver, tj.job.url);
|
||||
}));
|
||||
println!("Started thread for '{}'", tj.job.url);
|
||||
tj.job.last_run = Some(Instant::now());
|
||||
continue;
|
||||
}
|
||||
if tj.handle.is_some() && tj.handle.as_ref().unwrap().is_finished() {
|
||||
let duration = Instant::now().duration_since(tj.job.last_run.unwrap());
|
||||
tj.job.last_run = Some(Instant::now());
|
||||
let source = tj.handle.take().unwrap().join();
|
||||
if source.is_err() {
|
||||
println!("Error for job '{}': {:?}", tj.job.url, source);
|
||||
tj.handle = None;
|
||||
continue;
|
||||
}
|
||||
let fragment = scraper::Html::parse_document(source.unwrap().unwrap().as_str());
|
||||
let selector = scraper::Selector::parse(tj.job.selector).expect("Failed to parse selector");
|
||||
let mut result = String::from("");
|
||||
for element in fragment.select(&selector) {
|
||||
result.push_str(element.inner_html().as_str());
|
||||
}
|
||||
println!("Job for '{}' took about {}s", tj.job.url, duration.as_secs());
|
||||
if tj.last_result.is_none() {
|
||||
println!("New result: '{}'\n", result);
|
||||
tj.last_result = Some(result);
|
||||
}
|
||||
else {
|
||||
if tj.last_result.as_ref().unwrap().ne(&result) {
|
||||
println!("Change detected\nOld value: '{}'\nNew value: '{}'",
|
||||
tj.last_result.as_ref().unwrap(), result);
|
||||
tj.last_result = Some(result);
|
||||
}
|
||||
}
|
||||
tj.handle = None;
|
||||
}
|
||||
}
|
||||
std::thread::sleep(Duration::new(1, 0));
|
||||
}
|
||||
}
|
||||
|
||||
fn get_source(driver: &str, url: &str) -> Result<String, &'static str> {
|
||||
let caps = thirtyfour_sync::DesiredCapabilities::chrome();
|
||||
let driver = thirtyfour_sync::WebDriver::new(driver, &caps).expect("failed to get driver");
|
||||
driver.get(url).expect("failed to get url");
|
||||
let source = driver.page_source().expect("failed to get page source");
|
||||
driver.quit().expect("failed to close session");
|
||||
return Ok(source);
|
||||
}
|
Loading…
Reference in New Issue