From a69fd14ea496dc2bcd9bcc87fd617021bbfae13d Mon Sep 17 00:00:00 2001 From: Kienan Stewart Date: Sat, 8 Oct 2022 20:39:55 -0400 Subject: [PATCH] Use text-diff to compare run results --- Cargo.lock | 69 +++++++++++++++++++++++++++++++++++++++++++---------- Cargo.toml | 1 + src/main.rs | 19 ++++++++++----- 3 files changed, 70 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a8d70d8..689c5e2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -27,7 +27,7 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" dependencies = [ - "winapi", + "winapi 0.3.9", ] [[package]] @@ -74,7 +74,7 @@ checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" dependencies = [ "hermit-abi", "libc", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -159,7 +159,7 @@ dependencies = [ "num-traits", "serde", "time", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -360,7 +360,7 @@ checksum = "3fd78930633bd1c6e35c4b42b1df7b0cbc6bc191146e512bb3bedf243fcc3901" dependencies = [ "libc", "redox_users", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -625,6 +625,7 @@ dependencies = [ "rss", "scraper", "tempfile", + "text-diff", "thirtyfour_sync", ] @@ -816,6 +817,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "kernel32-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" +dependencies = [ + "winapi 0.2.8", + "winapi-build", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -1185,7 +1196,7 @@ dependencies = [ "csv", "encode_unicode", "lazy_static", - "term", + "term 0.5.2", "unicode-width", ] @@ -1383,7 +1394,7 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" dependencies = [ - "winapi", + "winapi 0.3.9", ] [[package]] @@ -1468,7 +1479,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f05ba609c234e60bee0d547fe94a4c7e9da733d1c962cf6e59efa4cd9c8bc75" dependencies = [ "lazy_static", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -1632,7 +1643,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "66d72b759436ae32898a2af0a14218dbf55efde3feeb170eb623637db85ee1e0" dependencies = [ "libc", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -1734,7 +1745,7 @@ dependencies = [ "libc", "redox_syscall 0.2.13", "remove_dir_all", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -1748,6 +1759,16 @@ dependencies = [ "utf-8", ] +[[package]] +name = "term" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2077e54d38055cf1ca0fd7933a2e00cd3ec8f6fed352b2a377f06dcdaaf3281" +dependencies = [ + "kernel32-sys", + "winapi 0.2.8", +] + [[package]] name = "term" version = "0.5.2" @@ -1756,7 +1777,17 @@ checksum = "edd106a334b7657c10b7c540a0106114feadeb4dc314513e97df481d5d966f42" dependencies = [ "byteorder", "dirs", - "winapi", + "winapi 0.3.9", +] + +[[package]] +name = "text-diff" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "309238dd66f8bf11a20d015b727b926f294a13fcb8d56770bb984e7a22c43897" +dependencies = [ + "getopts", + "term 0.2.14", ] [[package]] @@ -1839,7 +1870,7 @@ checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" dependencies = [ "libc", "wasi 0.10.0+wasi-snapshot-preview1", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -1872,7 +1903,7 @@ dependencies = [ "pin-project-lite", "socket2", "tokio-macros", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -2133,6 +2164,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "winapi" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" + [[package]] name = "winapi" version = "0.3.9" @@ -2143,6 +2180,12 @@ dependencies = [ "winapi-x86_64-pc-windows-gnu", ] +[[package]] +name = "winapi-build" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" + [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" @@ -2204,5 +2247,5 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d" dependencies = [ - "winapi", + "winapi 0.3.9", ] diff --git a/Cargo.toml b/Cargo.toml index 3160476..28f9326 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ chrono = "0.4" inotify = "0.10" prettydiff = "0.6" rss = "2" +text-diff = "0.4" scraper = "0.13" tempfile = "3" thirtyfour_sync = "0.27" diff --git a/src/main.rs b/src/main.rs index e30f606..cdb256f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -14,6 +14,8 @@ use conf::Conf; mod job; use job::Job; +use text_diff; + struct ThreadJob { job: Job, handle: Option< @@ -152,18 +154,23 @@ fn main() { println!("Job for '{}' took about {}s", tj.job.url, duration.as_secs()); if tj.last_result.is_none() { println!("New result for job: '{}'\n", tj.job.url); - let diff = prettydiff::diff_lines( - "", result.as_str()); - tj.job.update(result.as_str(), diff.to_string().as_str()); - tj.last_result = Some(result); + // Use scraper + fragments to attempt to "normalize" + let fragment = scraper::Html::parse_fragment(result.as_str()); + tj.last_result = Some(fragment.root_element().inner_html()); + tj.job.update(tj.last_result.as_ref().unwrap().as_str(), ""); } else { - if tj.last_result.as_ref().unwrap().ne(&result) { + let fragment = scraper::Html::parse_fragment(result.as_str()); + let normalized = fragment.root_element().inner_html(); + let (dist, _changeset) = text_diff::diff( + tj.last_result.as_ref().unwrap(), &normalized.as_str(), "" + ); + if dist != 0 { println!("Change detected for job '{}'", tj.job.url); let diff = prettydiff::diff_lines( tj.last_result.as_ref().unwrap(), result.as_str()); tj.job.update(result.as_str(), diff.to_string().as_str()); - tj.last_result = Some(result); + tj.last_result = Some(normalized); } } tj.handle = None;