haunter/src/main.rs

250 lines
8.7 KiB
Rust

use std::cmp::Ordering;
use std::env;
use std::path::Path;
use std::thread;
use std::time::Duration;
use std::time::Instant;
use thirtyfour_sync::WebDriverCommands;
mod conf;
use conf::Conf;
mod job;
use job::Job;
struct ThreadJob {
job: Job,
handle: Option<
std::thread::JoinHandle<
Result<String, &'static str>
>
>,
last_result: Option<String>,
}
impl ThreadJob {
    /// Comparison used to sort jobs so that idle, least recently run jobs
    /// come first.
    ///
    /// A job is treated as "running" whenever it still holds a thread handle;
    /// the thread's real state is deliberately not re-checked here. A running
    /// job compares greater than an idle one. When both jobs are in the same
    /// state the result is the comparison of their `last_run` values.
    fn lru_not_running(&self, b: &ThreadJob) -> Option<Ordering> {
        match (self.handle.is_some(), b.handle.is_some()) {
            (true, false) => Some(Ordering::Greater),
            (false, true) => Some(Ordering::Less),
            _ => self.job.last_run.partial_cmp(&b.job.last_run),
        }
    }
}
/// Checks that `dir` exists, is a directory and is not marked read-only.
///
/// Returns `true` when all three checks pass. Otherwise prints the reason to
/// standard output and returns `false`.
fn verify_directory(dir: &Path) -> bool {
    let shown = dir.display();
    match std::fs::metadata(dir) {
        Err(why) => {
            println!("Cannot stat directory '{}': {}", shown, why);
            false
        }
        Ok(info) if !info.is_dir() => {
            println!("Output directory '{}' is not a directory", shown);
            false
        }
        Ok(info) if info.permissions().readonly() => {
            println!("Output directory '{}' is read-only", shown);
            false
        }
        Ok(_) => true,
    }
}
fn main() {
let mut conf = Conf::get_default_conf();
conf.update_from_file(Path::new("/etc/haunter/haunter.conf"));
let mut args = env::args();
while let Some(arg) = args.next() {
if arg.eq("-f") {
match args.next() {
Some(file) => conf.update_from_file(Path::new(&file)),
_ => {println!("Missing argument after '-f'"); std::process::exit(1);},
};
}
}
if !verify_directory(&conf.output_dir) {
println!("Output directory unusable, aborting");
std::process::exit(1);
}
if !verify_directory(&conf.job_dir) {
println!("Output job unusable, aborting");
std::process::exit(1);
}
let mut jobs: Vec<ThreadJob> = Vec::new();
// Load all jobs from job directory
let job_dir = conf.job_dir.clone();
for entry in std::fs::read_dir(job_dir).expect("Failed to iterate over job directory") {
let _entry = match entry {
Err(why) => {
println!("Skipping '{}': Error reading file in job directory", why);
continue;
},
Ok(value) => value,
};
let md = _entry.metadata().expect("Failed to read file metadata");
if !md.is_file() {
println!("Skipping '{}': not a file", _entry.path().display());
continue;
}
match _entry.path().extension() {
Some(x) => {
if ! "job".eq(x) {
println!("Skipping '{}': does not have '.job' extension",
_entry.path().display());
continue;
}
},
None => {
println!("Skipping '{}': does not have '.job' extension",
_entry.path().display());
continue;
}
};
let job = match Job::from_file(&_entry.path(), &conf) {
Err(why) => {
println!("Failed to load job from '{}': {}", _entry.path().display(), why);
continue;
},
Ok(value) => value,
};
jobs.push(ThreadJob {
job: job,
handle: None,
last_result: None,
});
}
let max_running_tasks = 5;
loop {
let mut running_tasks = 0;
for tj in jobs.iter_mut().filter(|job| job.handle.is_some()) {
// Check if the task is done
if tj.handle.is_some() && tj.handle.as_ref().unwrap().is_finished() {
let duration = Instant::now().duration_since(tj.job.last_run.unwrap());
tj.job.last_run = Some(Instant::now());
let source = tj.handle.take().unwrap().join();
if source.is_err() {
println!("Error for job '{}': {:?}", tj.job.url, source);
tj.handle = None;
continue;
}
let fragment = scraper::Html::parse_document(source.unwrap().unwrap().as_str());
let selector = scraper::Selector::parse(&tj.job.selector.as_str()).expect("Failed to parse selector");
let mut result = String::from("");
for element in fragment.select(&selector) {
result.push_str(element.inner_html().as_str());
}
println!("Job for '{}' took about {}s", tj.job.url, duration.as_secs());
if tj.last_result.is_none() {
println!("New result for job: '{}'\n", tj.job.url);
let diff = prettydiff::diff_lines(
"", result.as_str());
tj.job.update(result.as_str(), diff.to_string().as_str());
tj.last_result = Some(result);
}
else {
if tj.last_result.as_ref().unwrap().ne(&result) {
println!("Change detected for job '{}'", tj.job.url);
let diff = prettydiff::diff_lines(
tj.last_result.as_ref().unwrap(), result.as_str());
tj.job.update(result.as_str(), diff.to_string().as_str());
tj.last_result = Some(result);
}
}
tj.handle = None;
}
else if tj.handle.is_some() {
running_tasks += 1;
}
}
while running_tasks < max_running_tasks {
// Sort by least recently run
// According to the docs, unstable_by is preferred for speed +
// reduced memory allocations, but doesn't guarantee order of
// equal elements.
jobs.sort_unstable_by(|a, b| a.lru_not_running(b).unwrap());
for tj in jobs.iter_mut() {
let should_run_by_time = tj.job.last_run.is_some() && Instant::now().duration_since(tj.job.last_run.unwrap()).ge(&tj.job.every);
if tj.handle.is_none() && (should_run_by_time || tj.job.last_run.is_none()) {
let driver = conf.driver_url.clone();
let url = tj.job.url.clone();
tj.handle = Some(thread::spawn(move || {
return get_source(driver.as_str(), url.as_str());
}));
println!("Started thread for '{}'", tj.job.url);
tj.job.last_run = Some(Instant::now());
running_tasks += 1;
if running_tasks >= max_running_tasks {
break;
}
}
}
break;
}
std::thread::sleep(Duration::new(1, 0));
}
}
/// Loads `url` in a new Firefox WebDriver session and returns the page source.
///
/// `driver` is passed to `WebDriver::new` as the server to connect to. Every
/// failure (connecting, navigating, reading the source, closing the session)
/// panics via `expect`, so as written this function never returns `Err`.
fn get_source(driver: &str, url: &str) -> Result<String, &'static str> {
    // The firefox driver seems to crash less often than the chrome driver.
    let capabilities = thirtyfour_sync::DesiredCapabilities::firefox();
    let session =
        thirtyfour_sync::WebDriver::new(driver, &capabilities).expect("failed to get driver");
    session.get(url).expect("failed to get url");
    let page = session.page_source().expect("failed to get page source");
    session.quit().expect("failed to close session");
    Ok(page)
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile;

    /// A regular file must be rejected even though it exists.
    #[test]
    fn verify_directory_exists_but_is_file() {
        let tf = tempfile::NamedTempFile::new().unwrap();
        assert!(!verify_directory(tf.path()));
    }

    /// A path that does not exist must be rejected.
    #[test]
    fn verify_directory_does_not_exist() {
        assert!(!verify_directory(Path::new("/fake/path/that/does/not/exist")));
    }

    /// A freshly created temporary directory must be accepted.
    #[test]
    fn verify_directory_exists() {
        let tf = tempfile::tempdir().unwrap();
        assert!(verify_directory(tf.path()));
    }

    /// A directory marked read-only must be rejected.
    #[test]
    fn verify_directory_exists_but_is_not_writable() {
        let tf = tempfile::tempdir().unwrap();
        let original = std::fs::metadata(tf.path()).unwrap().permissions();

        let mut read_only = original.clone();
        read_only.set_readonly(true);
        std::fs::set_permissions(tf.path(), read_only)
            .expect("Failed to set temp dir read-only");

        let result = verify_directory(tf.path());

        // Restore the exact original permissions before asserting, so the
        // temporary directory can be cleaned up even if the assertion fails.
        // `set_readonly(false)` is avoided because on Unix it makes the
        // directory writable by everyone instead of restoring the old mode.
        std::fs::set_permissions(tf.path(), original)
            .expect("Failed to set temp dir writable");

        assert!(!result);
    }
}