Compare commits

...

2 Commits

2 changed files with 55 additions and 21 deletions

View File

@ -3,7 +3,11 @@ version: '3'
services: services:
driver: driver:
image: docker.io/selenium/standalone-chrome image: docker.io/selenium/standalone-firefox
environment:
- "SE_START_XVFB=false"
- "SE_NODE_OVERRIDE_MAX_SESSIONS=true"
- "SE_NODE_MAX_SESSIONS=${HAUNTER_MAX_SESSIONS:-5}"
ports: ports:
- '4444:4444' - '4444:4444'
# watcher: # watcher:

View File

@ -1,8 +1,10 @@
use std::cmp::Ordering;
use std::env; use std::env;
use std::str::FromStr; use std::str::FromStr;
use std::thread; use std::thread;
use std::time::Duration; use std::time::Duration;
use std::time::Instant; use std::time::Instant;
use thirtyfour_sync::WebDriverCommands; use thirtyfour_sync::WebDriverCommands;
mod conf; mod conf;
@ -25,6 +27,23 @@ struct ThreadJob<'a> {
last_result: Option<String>, last_result: Option<String>,
} }
impl ThreadJob<'_> {
    /// Comparator used to order jobs for scheduling: jobs that are not
    /// running come first, and within the same running state the job with
    /// the older `last_run` comes first.
    ///
    /// A job counts as "running" whenever its `handle` is `Some`; the thread
    /// state itself is deliberately not re-checked on every comparison.
    ///
    /// Returns `Some(Ordering::Greater)` when only `self` has a handle,
    /// `Some(Ordering::Less)` when only `b` has one, and otherwise whatever
    /// `partial_cmp` yields for the two `last_run` values.
    fn lru_not_running(&self, b: &ThreadJob) -> Option<Ordering> {
        match (self.handle.is_some(), b.handle.is_some()) {
            // Only `self` holds a handle: it sorts after the idle job.
            (true, false) => Some(Ordering::Greater),
            // Only `b` holds a handle: `self` is idle and sorts first.
            (false, true) => Some(Ordering::Less),
            // Same running state on both sides: fall back to last-run time.
            _ => self.job.last_run.partial_cmp(&b.job.last_run),
        }
    }
}
fn main() { fn main() {
let mut conf = Conf { let mut conf = Conf {
job_dir: String::from_str("jobs.d").unwrap(), job_dir: String::from_str("jobs.d").unwrap(),
@ -69,24 +88,11 @@ fn main() {
last_result: None, last_result: None,
}); });
// @BUG: It seems the selenium chrome driver can't handle concurrent sessions from let max_running_tasks = 5;
// multiple threads. When the threads attempt to run concurrently, there are crashes,
// eg.
//
// thread '<unnamed>' panicked at 'failed to get url: UnknownError(WebDriverErrorInfo { status: 500, error: "", value: WebDriverErrorValue { message: "unknown error: session deleted because of page crash\nfrom tab crashed\n (Session info: chrome=105.0.5195.52)
//
// This should just run single jobs consecutively as a result.
loop { loop {
for tj in jobs.iter_mut() { let mut running_tasks = 0;
let should_run_by_time = tj.job.last_run.is_some() && Instant::now().duration_since(tj.job.last_run.unwrap()).ge(&tj.job.every); for tj in jobs.iter_mut().filter(|job| job.handle.is_some()) {
if tj.handle.is_none() && (should_run_by_time || tj.job.last_run.is_none()) { // Check if the task is done
tj.handle = Some(thread::spawn(|| {
return get_source(driver, tj.job.url);
}));
println!("Started thread for '{}'", tj.job.url);
tj.job.last_run = Some(Instant::now());
continue;
}
if tj.handle.is_some() && tj.handle.as_ref().unwrap().is_finished() { if tj.handle.is_some() && tj.handle.as_ref().unwrap().is_finished() {
let duration = Instant::now().duration_since(tj.job.last_run.unwrap()); let duration = Instant::now().duration_since(tj.job.last_run.unwrap());
tj.job.last_run = Some(Instant::now()); tj.job.last_run = Some(Instant::now());
@ -116,18 +122,42 @@ fn main() {
} }
tj.handle = None; tj.handle = None;
} }
else if tj.handle.is_some() {
running_tasks += 1;
}
}
while running_tasks < max_running_tasks {
// Sort by least recently run.
// According to the standard library docs, `sort_unstable_by` is
// typically faster and does not allocate, but it does not preserve the
// relative order of equal elements.
jobs.sort_unstable_by(|a, b| a.lru_not_running(b).unwrap());
for tj in jobs.iter_mut() {
let should_run_by_time = tj.job.last_run.is_some() && Instant::now().duration_since(tj.job.last_run.unwrap()).ge(&tj.job.every);
if tj.handle.is_none() && (should_run_by_time || tj.job.last_run.is_none()) {
tj.handle = Some(thread::spawn(|| {
return get_source(driver, tj.job.url);
}));
println!("Started thread for '{}'", tj.job.url);
tj.job.last_run = Some(Instant::now());
running_tasks += 1;
if running_tasks >= max_running_tasks {
break;
}
}
}
break;
} }
std::thread::sleep(Duration::new(1, 0)); std::thread::sleep(Duration::new(1, 0));
} }
} }
fn get_source(driver: &str, url: &str) -> Result<String, &'static str> { fn get_source(driver: &str, url: &str) -> Result<String, &'static str> {
let caps = thirtyfour_sync::DesiredCapabilities::chrome(); // The firefox driver seems to crash less often than the chrome driver.
let caps = thirtyfour_sync::DesiredCapabilities::firefox();
let driver = thirtyfour_sync::WebDriver::new(driver, &caps).expect("failed to get driver"); let driver = thirtyfour_sync::WebDriver::new(driver, &caps).expect("failed to get driver");
driver.get(url).expect("failed to get url"); driver.get(url).expect("failed to get url");
let source = driver.page_source().expect("failed to get page source"); let source = driver.page_source().expect("failed to get page source");
driver.quit().expect("failed to close session"); driver.quit().expect("failed to close session");
return Ok(source); return Ok(source);
} }