250 lines
8.7 KiB
Rust
250 lines
8.7 KiB
Rust
use std::cmp::Ordering;
|
|
use std::env;
|
|
use std::path::Path;
|
|
use std::thread;
|
|
use std::time::Duration;
|
|
use std::time::Instant;
|
|
|
|
use thirtyfour_sync::WebDriverCommands;
|
|
|
|
mod conf;
|
|
use conf::Conf;
|
|
|
|
mod job;
|
|
use job::Job;
|
|
|
|
struct ThreadJob {
|
|
job: Job,
|
|
handle: Option<
|
|
std::thread::JoinHandle<
|
|
Result<String, &'static str>
|
|
>
|
|
>,
|
|
last_result: Option<String>,
|
|
}
|
|
|
|
impl ThreadJob {
|
|
fn lru_not_running(&self, b: &ThreadJob) -> Option<Ordering> {
|
|
// the "greater" value is one that is running, but we
|
|
// don't recheck the thread state every time. If there's
|
|
// a handle that isn't none, assume it's running.
|
|
if self.handle.is_some() != b.handle.is_some() {
|
|
if self.handle.is_some() {
|
|
return Some(Ordering::Greater);
|
|
}
|
|
else {
|
|
return Some(Ordering::Less);
|
|
}
|
|
}
|
|
return self.job.last_run.partial_cmp(&b.job.last_run);
|
|
}
|
|
}
|
|
|
|
/// Checks that `dir` exists, is a directory and is not read-only.
///
/// Returns `true` when all three checks pass. Otherwise a message describing
/// the first failed check is printed to stdout and `false` is returned.
fn verify_directory(dir: &Path) -> bool {
    let shown = dir.display();
    match std::fs::metadata(dir) {
        Err(why) => {
            println!("Cannot stat directory '{}': {}", shown, why);
            false
        }
        Ok(md) if !md.is_dir() => {
            println!("Output directory '{}' is not a directory", shown);
            false
        }
        Ok(md) if md.permissions().readonly() => {
            println!("Output directory '{}' is read-only", shown);
            false
        }
        Ok(_) => true,
    }
}
|
|
|
|
fn main() {
|
|
let mut conf = Conf::get_default_conf();
|
|
conf.update_from_file(Path::new("/etc/haunter/haunter.conf"));
|
|
|
|
let mut args = env::args();
|
|
while let Some(arg) = args.next() {
|
|
if arg.eq("-f") {
|
|
match args.next() {
|
|
Some(file) => conf.update_from_file(Path::new(&file)),
|
|
_ => {println!("Missing argument after '-f'"); std::process::exit(1);},
|
|
};
|
|
}
|
|
}
|
|
|
|
if !verify_directory(&conf.output_dir) {
|
|
println!("Output directory unusable, aborting");
|
|
std::process::exit(1);
|
|
}
|
|
if !verify_directory(&conf.job_dir) {
|
|
println!("Output job unusable, aborting");
|
|
std::process::exit(1);
|
|
}
|
|
|
|
let mut jobs: Vec<ThreadJob> = Vec::new();
|
|
|
|
// Load all jobs from job directory
|
|
let job_dir = conf.job_dir.clone();
|
|
for entry in std::fs::read_dir(job_dir).expect("Failed to iterate over job directory") {
|
|
let _entry = match entry {
|
|
Err(why) => {
|
|
println!("Skipping '{}': Error reading file in job directory", why);
|
|
continue;
|
|
},
|
|
Ok(value) => value,
|
|
};
|
|
let md = _entry.metadata().expect("Failed to read file metadata");
|
|
if !md.is_file() {
|
|
println!("Skipping '{}': not a file", _entry.path().display());
|
|
continue;
|
|
}
|
|
match _entry.path().extension() {
|
|
Some(x) => {
|
|
if ! "job".eq(x) {
|
|
println!("Skipping '{}': does not have '.job' extension",
|
|
_entry.path().display());
|
|
continue;
|
|
}
|
|
},
|
|
None => {
|
|
println!("Skipping '{}': does not have '.job' extension",
|
|
_entry.path().display());
|
|
continue;
|
|
}
|
|
};
|
|
|
|
let job = match Job::from_file(&_entry.path(), &conf) {
|
|
Err(why) => {
|
|
println!("Failed to load job from '{}': {}", _entry.path().display(), why);
|
|
continue;
|
|
},
|
|
Ok(value) => value,
|
|
};
|
|
jobs.push(ThreadJob {
|
|
job: job,
|
|
handle: None,
|
|
last_result: None,
|
|
});
|
|
}
|
|
|
|
let max_running_tasks = 5;
|
|
loop {
|
|
let mut running_tasks = 0;
|
|
for tj in jobs.iter_mut().filter(|job| job.handle.is_some()) {
|
|
// Check if the task is done
|
|
if tj.handle.is_some() && tj.handle.as_ref().unwrap().is_finished() {
|
|
let duration = Instant::now().duration_since(tj.job.last_run.unwrap());
|
|
tj.job.last_run = Some(Instant::now());
|
|
let source = tj.handle.take().unwrap().join();
|
|
if source.is_err() {
|
|
println!("Error for job '{}': {:?}", tj.job.url, source);
|
|
tj.handle = None;
|
|
continue;
|
|
}
|
|
let fragment = scraper::Html::parse_document(source.unwrap().unwrap().as_str());
|
|
let selector = scraper::Selector::parse(&tj.job.selector.as_str()).expect("Failed to parse selector");
|
|
let mut result = String::from("");
|
|
for element in fragment.select(&selector) {
|
|
result.push_str(element.inner_html().as_str());
|
|
}
|
|
println!("Job for '{}' took about {}s", tj.job.url, duration.as_secs());
|
|
if tj.last_result.is_none() {
|
|
println!("New result for job: '{}'\n", tj.job.url);
|
|
let diff = prettydiff::diff_lines(
|
|
"", result.as_str());
|
|
tj.job.update(result.as_str(), diff.to_string().as_str());
|
|
tj.last_result = Some(result);
|
|
}
|
|
else {
|
|
if tj.last_result.as_ref().unwrap().ne(&result) {
|
|
println!("Change detected for job '{}'", tj.job.url);
|
|
let diff = prettydiff::diff_lines(
|
|
tj.last_result.as_ref().unwrap(), result.as_str());
|
|
tj.job.update(result.as_str(), diff.to_string().as_str());
|
|
tj.last_result = Some(result);
|
|
}
|
|
}
|
|
tj.handle = None;
|
|
}
|
|
else if tj.handle.is_some() {
|
|
running_tasks += 1;
|
|
}
|
|
}
|
|
while running_tasks < max_running_tasks {
|
|
// Sort by least recently run
|
|
// According to the docs, unstable_by is preferred for speed +
|
|
// reduced memory allocations, but doesn't guarantee order of
|
|
// equal elements.
|
|
jobs.sort_unstable_by(|a, b| a.lru_not_running(b).unwrap());
|
|
for tj in jobs.iter_mut() {
|
|
let should_run_by_time = tj.job.last_run.is_some() && Instant::now().duration_since(tj.job.last_run.unwrap()).ge(&tj.job.every);
|
|
if tj.handle.is_none() && (should_run_by_time || tj.job.last_run.is_none()) {
|
|
let driver = conf.driver_url.clone();
|
|
let url = tj.job.url.clone();
|
|
tj.handle = Some(thread::spawn(move || {
|
|
return get_source(driver.as_str(), url.as_str());
|
|
}));
|
|
println!("Started thread for '{}'", tj.job.url);
|
|
tj.job.last_run = Some(Instant::now());
|
|
running_tasks += 1;
|
|
if running_tasks >= max_running_tasks {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
|
|
std::thread::sleep(Duration::new(1, 0));
|
|
}
|
|
}
|
|
|
|
fn get_source(driver: &str, url: &str) -> Result<String, &'static str> {
|
|
// The firefox driver seems to crash less often than the chrome driver.
|
|
let caps = thirtyfour_sync::DesiredCapabilities::firefox();
|
|
let driver = thirtyfour_sync::WebDriver::new(driver, &caps).expect("failed to get driver");
|
|
driver.get(url).expect("failed to get url");
|
|
let source = driver.page_source().expect("failed to get page source");
|
|
driver.quit().expect("failed to close session");
|
|
return Ok(source);
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    use tempfile;

    /// A path that exists but is a regular file must be rejected.
    #[test]
    fn verify_directory_exists_but_is_file() {
        let file = tempfile::NamedTempFile::new().unwrap();
        let usable = verify_directory(file.path());
        assert!(!usable);
    }

    /// A path that does not exist must be rejected.
    #[test]
    fn verify_directory_does_not_exist() {
        let missing = Path::new("/fake/path/that/does/not/exist");
        assert!(!verify_directory(missing));
    }

    /// A freshly created temporary directory must be accepted.
    #[test]
    fn verify_directory_exists() {
        let dir = tempfile::tempdir().unwrap();
        let usable = verify_directory(dir.path());
        assert!(usable);
    }

    /// A read-only directory must be rejected.
    #[test]
    fn verify_directory_exists_but_is_not_writable() {
        let dir = tempfile::tempdir().unwrap();
        let target = dir.path();
        let md = std::fs::metadata(target).unwrap();

        let mut locked = md.permissions();
        locked.set_readonly(true);
        std::fs::set_permissions(target, locked).expect("Failed to set temp dir read-only");

        let result = verify_directory(target);

        // Make the directory writable again before asserting, so the
        // temporary directory can be cleaned up even if the check fails.
        let mut unlocked = md.permissions();
        unlocked.set_readonly(false);
        std::fs::set_permissions(target, unlocked).expect("Failed to set temp dir writable");

        assert!(!result);
    }
}
|