Initial commit

This commit is contained in:
Kienan Stewart 2022-09-23 19:56:04 -04:00
commit a6784fdce0
6 changed files with 1891 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/target

1756
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

10
Cargo.toml Normal file
View File

@ -0,0 +1,10 @@
[package]
name = "haunter"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
# Blocking (synchronous) selenium WebDriver client; main.rs drives a remote
# chrome session with it to fetch page source.
thirtyfour_sync = "0.27"
# HTML parsing + CSS-selector extraction of the watched page fragments.
scraper = "0.13"

3
README.md Normal file
View File

@ -0,0 +1,3 @@
# Haunter
Watches websites for changes.

13
container-compose.yml Normal file
View File

@ -0,0 +1,13 @@
---
version: '3'
services:
  # Standalone selenium + chrome; src/main.rs connects to it at
  # http://localhost:4444.
  driver:
    image: docker.io/selenium/standalone-chrome
    ports:
      - '4444:4444'
  # NOTE(review): `build_context` is not a valid compose key — if this service
  # is re-enabled it should be `build:` with a nested `context: ./`.
  # watcher:
  #   build_context:
  #     - ./
  #   depends_on:
  #     - driver

108
src/main.rs Normal file
View File

@ -0,0 +1,108 @@
use std::option;
use std::thread;
use std::time::Duration;
use std::time::Instant;
use thirtyfour_sync::WebDriverCommands;
/// A single site-watching task: what to fetch, what to extract, how often.
struct Job<'a> {
    /// Page to fetch through the WebDriver session.
    url: &'a str,
    /// CSS selector; the concatenated inner HTML of its matches is the
    /// watched value.
    selector: &'a str,
    /// Minimum interval between successive runs of this job.
    every: Duration,
    /// When the job was last started; `None` until its first run.
    last_run: Option<Instant>,
}
/// A [`Job`] plus the bookkeeping needed to run it on a background thread.
struct ThreadJob<'a> {
    /// The watch task itself.
    job: Job<'a>,
    /// Handle of the in-flight fetch, if one has been spawned; the thread
    /// resolves to the page source or a static error message.
    handle: Option<
        std::thread::JoinHandle<
            Result<String, &'static str>
        >
    >,
    /// Extracted value from the previous completed run; compared against the
    /// new value to detect changes.
    last_result: Option<String>,
}
/// Poll a fixed set of jobs forever: spawn a worker thread per due job,
/// harvest finished workers, extract the selected HTML, and report changes.
fn main() {
    // Address of the selenium standalone-chrome service
    // (container-compose.yml publishes it on this port).
    let driver = "http://localhost:4444";

    let mut jobs = Vec::new();
    let some_job = Job {
        url: "https://www.rust-lang.org",
        selector: "a.download-link",
        every: Duration::new(60, 0),
        last_run: None,
    };
    let other_job = Job {
        url: "https://arstechnica.com/",
        selector: "li.split-feature:nth-child(1) > header:nth-child(4) > h2:nth-child(1) > a:nth-child(1)",
        every: Duration::new(120, 0),
        last_run: None,
    };
    jobs.push(ThreadJob {
        job: some_job,
        handle: None,
        last_result: None,
    });
    jobs.push(ThreadJob {
        job: other_job,
        handle: None,
        last_result: None,
    });

    // @BUG: It seems the selenium chrome driver can't handle concurrent
    // sessions from multiple threads. When the threads attempt to run
    // concurrently, there are crashes, eg.
    //
    // thread '<unnamed>' panicked at 'failed to get url: UnknownError(WebDriverErrorInfo { status: 500, error: "", value: WebDriverErrorValue { message: "unknown error: session deleted because of page crash\nfrom tab crashed\n (Session info: chrome=105.0.5195.52)
    //
    // This should just run single jobs consecutively as a result.
    loop {
        for tj in jobs.iter_mut() {
            // A job is due on its first pass, or once `every` has elapsed
            // since it was last started.
            let due = match tj.job.last_run {
                None => true,
                Some(started) => started.elapsed() >= tj.job.every,
            };

            if tj.handle.is_none() && due {
                // Copy the &'static str out of `tj` so the closure can be
                // `move` and satisfy thread::spawn's 'static bound; the
                // previous non-`move` closure borrowed `tj` and could not
                // compile alongside the `tj.handle = ...` assignment.
                let url = tj.job.url;
                tj.handle = Some(thread::spawn(move || get_source(driver, url)));
                println!("Started thread for '{}'", url);
                tj.job.last_run = Some(Instant::now());
                continue;
            }

            // Harvest a finished worker, if any.
            if tj.handle.as_ref().map_or(false, |h| h.is_finished()) {
                let duration = tj
                    .job
                    .last_run
                    .expect("last_run is set whenever a handle exists")
                    .elapsed();
                tj.job.last_run = Some(Instant::now());

                // join() errs only if the worker panicked; the inner Result
                // is get_source's own failure reporting. Handle both instead
                // of unwrap()ing the inner value.
                let page = match tj.handle.take().expect("checked above").join() {
                    Ok(Ok(source)) => source,
                    Ok(Err(e)) => {
                        println!("Error for job '{}': {:?}", tj.job.url, e);
                        continue;
                    }
                    Err(panic) => {
                        println!("Error for job '{}': {:?}", tj.job.url, panic);
                        continue;
                    }
                };

                let fragment = scraper::Html::parse_document(page.as_str());
                let selector = scraper::Selector::parse(tj.job.selector)
                    .expect("Failed to parse selector");
                // The watched value: concatenated inner HTML of every match.
                let result: String = fragment
                    .select(&selector)
                    .map(|element| element.inner_html())
                    .collect();

                println!("Job for '{}' took about {}s", tj.job.url, duration.as_secs());

                let changed = match tj.last_result.as_deref() {
                    None => {
                        println!("New result: '{}'\n", result);
                        true
                    }
                    Some(previous) if previous != result.as_str() => {
                        println!("Change detected\nOld value: '{}'\nNew value: '{}'",
                            previous, result);
                        true
                    }
                    Some(_) => false,
                };
                if changed {
                    tj.last_result = Some(result);
                }
            }
        }
        // Cheap poll; the jobs themselves are throttled via `every`.
        std::thread::sleep(Duration::new(1, 0));
    }
}
/// Fetch `url` through the selenium server at `driver` and return the raw
/// page source.
///
/// # Errors
///
/// Returns a static message naming the WebDriver step that failed. The
/// previous version `expect()`ed every step, which (a) made the declared
/// error type unreachable, (b) panicked the worker thread on ordinary
/// network/driver failures, and (c) skipped `quit()`, leaking the selenium
/// session. Here the session is closed on every path once it was opened.
fn get_source(driver: &str, url: &str) -> Result<String, &'static str> {
    let caps = thirtyfour_sync::DesiredCapabilities::chrome();
    // The only step with nothing to clean up on failure.
    let session = thirtyfour_sync::WebDriver::new(driver, &caps)
        .map_err(|_| "failed to get driver")?;

    // Capture the navigation/source outcome but do not bail yet: the session
    // must be closed on every path or selenium leaks it.
    let source = session
        .get(url)
        .map_err(|_| "failed to get url")
        .and_then(|_| {
            session
                .page_source()
                .map_err(|_| "failed to get page source")
        });

    session.quit().map_err(|_| "failed to close session")?;
    source
}