use std::cmp::Ordering; use std::env; use std::path::Path; use std::thread; use std::time::Duration; use std::time::Instant; use thirtyfour_sync::WebDriverCommands; mod conf; use conf::Conf; mod job; use job::Job; struct ThreadJob { job: Job, handle: Option< std::thread::JoinHandle< Result > >, last_result: Option, } impl ThreadJob { fn lru_not_running(&self, b: &ThreadJob) -> Option { // the "greater" value is one that is running, but we // don't recheck the thread state every time. If there's // a handle that isn't none, assume it's running. if self.handle.is_some() != b.handle.is_some() { if self.handle.is_some() { return Some(Ordering::Greater); } else { return Some(Ordering::Less); } } return self.job.last_run.partial_cmp(&b.job.last_run); } } fn verify_directory(dir: &Path) -> bool { let md = match std::fs::metadata(&dir) { Err(why) => { println!("Cannot stat directory '{}': {}", dir.display(), why); return false; }, Ok(val) => val, }; if !md.is_dir() { println!("Output directory '{}' is not a directory", dir.display()); return false; } if md.permissions().readonly() { println!("Output directory '{}' is read-only", dir.display()); return false; } return true; } fn main() { let mut conf = Conf::get_default_conf(); conf.update_from_file(Path::new("/etc/haunter/haunter.conf")); let mut args = env::args(); while let Some(arg) = args.next() { if arg.eq("-f") { match args.next() { Some(file) => conf.update_from_file(Path::new(&file)), _ => {println!("Missing argument after '-f'"); std::process::exit(1);}, }; } } if !verify_directory(&conf.output_dir) { println!("Output directory unusable, aborting"); std::process::exit(1); } if !verify_directory(&conf.job_dir) { println!("Output job unusable, aborting"); std::process::exit(1); } let mut jobs: Vec = Vec::new(); // Load all jobs from job directory let job_dir = conf.job_dir.clone(); for entry in std::fs::read_dir(job_dir).expect("Failed to iterate over job directory") { let _entry = match entry { Err(why) => { println!("Skipping '{}': Error reading file in job directory", why); continue; }, Ok(value) => value, }; let md = _entry.metadata().expect("Failed to read file metadata"); if !md.is_file() { println!("Skipping '{}': not a file", _entry.path().display()); continue; } match _entry.path().extension() { Some(x) => { if ! "job".eq(x) { println!("Skipping '{}': does not have '.job' extension", _entry.path().display()); continue; } }, None => { println!("Skipping '{}': does not have '.job' extension", _entry.path().display()); continue; } }; let job = match Job::from_file(&_entry.path(), &conf) { Err(why) => { println!("Failed to load job from '{}': {}", _entry.path().display(), why); continue; }, Ok(value) => value, }; jobs.push(ThreadJob { job: job, handle: None, last_result: None, }); } let max_running_tasks = 5; loop { let mut running_tasks = 0; for tj in jobs.iter_mut().filter(|job| job.handle.is_some()) { // Check if the task is done if tj.handle.is_some() && tj.handle.as_ref().unwrap().is_finished() { let duration = Instant::now().duration_since(tj.job.last_run.unwrap()); tj.job.last_run = Some(Instant::now()); let source = tj.handle.take().unwrap().join(); if source.is_err() { println!("Error for job '{}': {:?}", tj.job.url, source); tj.handle = None; continue; } let fragment = scraper::Html::parse_document(source.unwrap().unwrap().as_str()); let selector = scraper::Selector::parse(&tj.job.selector.as_str()).expect("Failed to parse selector"); let mut result = String::from(""); for element in fragment.select(&selector) { result.push_str(element.inner_html().as_str()); } println!("Job for '{}' took about {}s", tj.job.url, duration.as_secs()); if tj.last_result.is_none() { println!("New result for job: '{}'\n", tj.job.url); let diff = prettydiff::diff_lines( "", result.as_str()); tj.job.update(result.as_str(), diff.to_string().as_str()); tj.last_result = Some(result); } else { if tj.last_result.as_ref().unwrap().ne(&result) { println!("Change detected for job '{}'", tj.job.url); let diff = prettydiff::diff_lines( tj.last_result.as_ref().unwrap(), result.as_str()); tj.job.update(result.as_str(), diff.to_string().as_str()); tj.last_result = Some(result); } } tj.handle = None; } else if tj.handle.is_some() { running_tasks += 1; } } while running_tasks < max_running_tasks { // Sort by least recently run // According to the docs, unstable_by is preferred for speed + // reduced memory allocations, but doesn't guarantee order of // equal elements. jobs.sort_unstable_by(|a, b| a.lru_not_running(b).unwrap()); for tj in jobs.iter_mut() { let should_run_by_time = tj.job.last_run.is_some() && Instant::now().duration_since(tj.job.last_run.unwrap()).ge(&tj.job.every); if tj.handle.is_none() && (should_run_by_time || tj.job.last_run.is_none()) { let driver = conf.driver_url.clone(); let url = tj.job.url.clone(); tj.handle = Some(thread::spawn(move || { return get_source(driver.as_str(), url.as_str()); })); println!("Started thread for '{}'", tj.job.url); tj.job.last_run = Some(Instant::now()); running_tasks += 1; if running_tasks >= max_running_tasks { break; } } } break; } std::thread::sleep(Duration::new(1, 0)); } } fn get_source(driver: &str, url: &str) -> Result { // The firefox driver seems to crash less often than the chrome driver. let caps = thirtyfour_sync::DesiredCapabilities::firefox(); let driver = thirtyfour_sync::WebDriver::new(driver, &caps).expect("failed to get driver"); driver.get(url).expect("failed to get url"); let source = driver.page_source().expect("failed to get page source"); driver.quit().expect("failed to close session"); return Ok(source); } #[cfg(test)] mod tests { use super::*; use tempfile; #[test] fn verify_directory_exists_but_is_file() { let tf = tempfile::NamedTempFile::new().unwrap(); assert!(!verify_directory(tf.path())); } #[test] fn verify_directory_does_not_exist() { assert!(!verify_directory(Path::new("/fake/path/that/does/not/exist"))); } #[test] fn verify_directory_exists() { let tf = tempfile::tempdir().unwrap(); assert!(verify_directory(tf.path())); } #[test] fn verify_directory_exists_but_is_not_writable() { let tf = tempfile::tempdir().unwrap(); let md = std::fs::metadata(&tf.path()).unwrap(); let mut perms = md.permissions(); perms.set_readonly(true); std::fs::set_permissions(&tf.path(), perms).expect("Failed to set temp dir read-only"); let result = verify_directory(tf.path()); perms = md.permissions(); perms.set_readonly(false); std::fs::set_permissions(&tf.path(), perms).expect("Failed to set temp dir writable"); assert!(!result); } }