Skip anchor checking for URL with prefix in config (#812)

* cargo fmt & clippy
* Skip anchor checking for URL with prefix in config
5 years ago · 6149fd17e1
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1241,6 +1241,7 @@ dependencies = [
 name = "link_checker"
 version = "0.1.0"
 dependencies = [
 "config 0.1.0",
 "errors 0.1.0",
 "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "reqwest 0.9.21 (registry+https://github.com/rust-lang/crates.io-index)",
--- a/components/config/src/config.rs
+++ b/components/config/src/config.rs
@@ -7,8 +7,8 @@ use syntect::parsing::{SyntaxSet, SyntaxSetBuilder};
 use toml;
 use toml::Value as Toml;
 use errors::Result;
 use errors::Error;
 use errors::Result;
 use highlighting::THEME_SET;
 use theme::Theme;
 use utils::fs::read_file_with_error;
@@ -86,7 +86,20 @@ impl Default for Taxonomy {
    }
 }
 type TranslateTerm  = HashMap<String, String>;
 type TranslateTerm = HashMap<String, String>;
 /// Link checker settings, read from the `[link_checker]` table of the site config.
 ///
 /// `#[serde(default)]` means any field missing from the table is filled in from
 /// this type's `Default` implementation.
 #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
 #[serde(default)]
 pub struct LinkChecker {
    /// Skip anchor checking for these URL prefixes.
    /// A URL matches when it starts with one of the listed strings.
    pub skip_anchor_prefixes: Vec<String>,
 }
 impl Default for LinkChecker {
    fn default() -> LinkChecker {
        LinkChecker { skip_anchor_prefixes: Vec::new() }
    }
 }
 #[derive(Clone, Debug, Serialize, Deserialize)]
 #[serde(default)]
@@ -152,6 +165,8 @@ pub struct Config {
    #[serde(skip_serializing, skip_deserializing)] // not a typo, both are needed
    pub extra_syntax_set: Option<SyntaxSet>,
    pub link_checker: LinkChecker,
    /// All user params set in [extra] in the config
    pub extra: HashMap<String, Toml>,
@@ -317,9 +332,16 @@ impl Config {
            Error::msg(format!("Translation for language '{}' is missing", lang.as_ref()))
        })?;
        terms.get(key.as_ref()).ok_or_else(|| {
            Error::msg(format!("Translation key '{}' for language '{}' is missing", key.as_ref(), lang.as_ref()))
        }).map(|term| term.to_string())
        terms
            .get(key.as_ref())
            .ok_or_else(|| {
                Error::msg(format!(
                    "Translation key '{}' for language '{}' is missing",
                    key.as_ref(),
                    lang.as_ref()
                ))
            })
            .map(|term| term.to_string())
    }
 }
@@ -346,6 +368,7 @@ impl Default for Config {
            translations: HashMap::new(),
            extra_syntaxes: Vec::new(),
            extra_syntax_set: None,
            link_checker: LinkChecker::default(),
            extra: HashMap::new(),
            build_timestamp: Some(1),
        }
@@ -551,4 +574,25 @@ ignored_content = ["*.{graphml,iso}", "*.py?"]
        assert!(g.is_match("foo.py3"));
        assert!(!g.is_match("foo.py"));
    }
    #[test]
    fn link_checker_skip_anchor_prefixes() {
        // The `[link_checker]` table must end up in `Config::link_checker`
        let raw_config = r#"
 title = "My site"
 base_url = "example.com"
 [link_checker]
 skip_anchor_prefixes = [
    "https://caniuse.com/#feat=",
    "https://github.com/rust-lang/rust/blob/",
 ]
        "#;
        let expected =
            vec!["https://caniuse.com/#feat=", "https://github.com/rust-lang/rust/blob/"];

        let parsed = Config::parse(raw_config).unwrap();
        assert_eq!(parsed.link_checker.skip_anchor_prefixes, expected);
    }
 }
--- a/components/config/src/lib.rs
+++ b/components/config/src/lib.rs
@@ -14,7 +14,7 @@ extern crate utils;
 mod config;
 pub mod highlighting;
 mod theme;
 pub use config::{Config, Language, Taxonomy};
 pub use config::{Config, Language, LinkChecker, Taxonomy};
 use std::path::Path;
--- a/components/imageproc/src/lib.rs
+++ b/components/imageproc/src/lib.rs
@@ -272,7 +272,7 @@ impl ImageOp {
                } else {
                    img
                }
            },
            }
            Fill(w, h) => {
                let factor_w = img_w as f32 / w as f32;
                let factor_h = img_h as f32 / h as f32;
--- a/components/library/src/library.rs
+++ b/components/library/src/library.rs
@@ -1,7 +1,7 @@
 use std::collections::{HashMap, HashSet};
 use std::path::{Path, PathBuf};
 use slotmap::{DenseSlotMap, DefaultKey};
 use slotmap::{DefaultKey, DenseSlotMap};
 use front_matter::SortBy;
--- a/components/library/src/sorting.rs
+++ b/components/library/src/sorting.rs
@@ -21,7 +21,9 @@ pub fn sort_actual_pages_by_date(a: &&Page, b: &&Page) -> Ordering {
 /// Takes a list of (page key, date, permalink) and sorts them by date if possible
 /// Pages without a date will be put in the unsortable bucket
 /// The permalink is used to break ties
 pub fn sort_pages_by_date(pages: Vec<(&DefaultKey, Option<NaiveDateTime>, &str)>) -> (Vec<DefaultKey>, Vec<DefaultKey>) {
 pub fn sort_pages_by_date(
    pages: Vec<(&DefaultKey, Option<NaiveDateTime>, &str)>,
 ) -> (Vec<DefaultKey>, Vec<DefaultKey>) {
    let (mut can_be_sorted, cannot_be_sorted): (Vec<_>, Vec<_>) =
        pages.into_par_iter().partition(|page| page.1.is_some());
@@ -40,7 +42,9 @@ pub fn sort_pages_by_date(pages: Vec<(&DefaultKey, Option<NaiveDateTime>, &str)>
 /// Takes a list of (page key, weight, permalink) and sorts them by weight if possible
 /// Pages without a weight will be put in the unsortable bucket
 /// The permalink is used to break ties
 pub fn sort_pages_by_weight(pages: Vec<(&DefaultKey, Option<usize>, &str)>) -> (Vec<DefaultKey>, Vec<DefaultKey>) {
 pub fn sort_pages_by_weight(
    pages: Vec<(&DefaultKey, Option<usize>, &str)>,
 ) -> (Vec<DefaultKey>, Vec<DefaultKey>) {
    let (mut can_be_sorted, cannot_be_sorted): (Vec<_>, Vec<_>) =
        pages.into_par_iter().partition(|page| page.1.is_some());
@@ -57,7 +61,9 @@ pub fn sort_pages_by_weight(pages: Vec<(&DefaultKey, Option<usize>, &str)>) -> (
 }
 /// Find the lighter/heavier and earlier/later pages for all pages having a date/weight
 pub fn find_siblings(sorted: &[DefaultKey]) -> Vec<(DefaultKey, Option<DefaultKey>, Option<DefaultKey>)> {
 pub fn find_siblings(
    sorted: &[DefaultKey],
 ) -> Vec<(DefaultKey, Option<DefaultKey>, Option<DefaultKey>)> {
    let mut res = Vec::with_capacity(sorted.len());
    let length = sorted.len();
--- a/components/link_checker/Cargo.toml
+++ b/components/link_checker/Cargo.toml
@@ -7,4 +7,5 @@ authors = ["Vincent Prouillet <prouillet.vincent@gmail.com>"]
 reqwest = "0.9"
 lazy_static = "1"
 config = { path = "../config" }
 errors = { path = "../errors" }
--- a/components/link_checker/src/lib.rs
+++ b/components/link_checker/src/lib.rs
@@ -2,11 +2,13 @@ extern crate reqwest;
 #[macro_use]
 extern crate lazy_static;
 extern crate config;
 extern crate errors;
 use reqwest::header::{HeaderMap, ACCEPT};
 use reqwest::StatusCode;
 use config::LinkChecker;
 use errors::Result;
 use std::collections::HashMap;
@@ -51,7 +53,7 @@ lazy_static! {
    static ref LINKS: Arc<RwLock<HashMap<String, LinkResult>>> = Arc::new(RwLock::new(HashMap::new()));
 }
 pub fn check_url(url: &str) -> LinkResult {
 pub fn check_url(url: &str, config: &LinkChecker) -> LinkResult {
    {
        let guard = LINKS.read().unwrap();
        if let Some(res) = guard.get(url) {
@@ -65,9 +67,11 @@ pub fn check_url(url: &str) -> LinkResult {
    let client = reqwest::Client::new();
    let check_anchor = !config.skip_anchor_prefixes.iter().any(|prefix| url.starts_with(prefix));
    // Need to actually do the link checking
    let res = match client.get(url).headers(headers).send() {
        Ok(ref mut response) if has_anchor(url) => {
        Ok(ref mut response) if check_anchor && has_anchor(url) => {
            match check_page_for_anchor(url, response.text()) {
                Ok(_) => LinkResult { code: Some(response.status()), error: None },
                Err(e) => LinkResult { code: None, error: Some(e.to_string()) },
@@ -111,21 +115,21 @@ fn check_page_for_anchor(url: &str, body: reqwest::Result<String>) -> Result<()>
 #[cfg(test)]
 mod tests {
    use super::{check_page_for_anchor, check_url, has_anchor, LINKS};
    use super::{check_page_for_anchor, check_url, has_anchor, LinkChecker, LINKS};
    #[test]
    fn can_validate_ok_links() {
        let url = "https://google.com";
        let res = check_url(url);
        let res = check_url(url, &LinkChecker::default());
        assert!(res.is_valid());
        assert!(LINKS.read().unwrap().get(url).is_some());
        let res = check_url(url);
        let res = check_url(url, &LinkChecker::default());
        assert!(res.is_valid());
    }
    #[test]
    fn can_fail_404_links() {
        let res = check_url("https://google.comys");
        let res = check_url("https://google.comys", &LinkChecker::default());
        assert_eq!(res.is_valid(), false);
        assert!(res.code.is_none());
        assert!(res.error.is_some());
@@ -190,4 +194,23 @@ mod tests {
        let res = has_anchor(url);
        assert_eq!(res, false);
    }
    #[test]
    fn skip_anchor_prefixes() {
        let skipped_prefix = "https://github.com/rust-lang/rust/blob/".to_owned();
        let config = LinkChecker { skip_anchor_prefixes: vec![skipped_prefix] };

        // This URL starts with the configured prefix, so its anchor is not checked
        let permalink = "https://github.com/rust-lang/rust/blob/c772948b687488a087356cb91432425662e034b9/src/librustc_back/target/mod.rs#L194-L214";
        assert!(check_url(permalink, &config).is_valid());

        // URLs outside the prefix still have their anchors checked
        let glossary = "https://help.github.com/en/articles/github-glossary#blame";
        assert!(check_url(glossary, &config).is_valid());

        let glossary_invalid =
            "https://help.github.com/en/articles/github-glossary#anchor-does-not-exist";
        assert!(!check_url(glossary_invalid, &config).is_valid());
    }
 }
--- a/components/rebuild/src/lib.rs
+++ b/components/rebuild/src/lib.rs
@@ -335,7 +335,7 @@ fn is_section(path: &str, languages_codes: &[&str]) -> bool {
        }
    }
    return false;
    false
 }
 /// What happens when a section or a page is created/edited
--- a/components/rendering/src/markdown.rs
+++ b/components/rendering/src/markdown.rs
@@ -296,8 +296,9 @@ pub fn markdown_to_html(content: &str, context: &RenderContext) -> Result<Render
            let start_idx = heading_ref.start_idx;
            let end_idx = heading_ref.end_idx;
            let title = get_text(&events[start_idx + 1..end_idx]);
            let id =
                heading_ref.id.unwrap_or_else(|| find_anchor(&inserted_anchors, slugify(&title), 0));
            let id = heading_ref
                .id
                .unwrap_or_else(|| find_anchor(&inserted_anchors, slugify(&title), 0));
            inserted_anchors.push(id.clone());
            // insert `id` to the tag
@@ -326,7 +327,8 @@ pub fn markdown_to_html(content: &str, context: &RenderContext) -> Result<Render
            // record heading to make table of contents
            let permalink = format!("{}#{}", context.current_page_permalink, id);
            let h = Heading { level: heading_ref.level, id, permalink, title, children: Vec::new() };
            let h =
                Heading { level: heading_ref.level, id, permalink, title, children: Vec::new() };
            headings.push(h);
        }
--- a/components/site/src/lib.rs
+++ b/components/site/src/lib.rs
@@ -399,7 +399,7 @@ impl Site {
            all_links
                .par_iter()
                .filter_map(|(page_path, link)| {
                    let res = check_url(&link);
                    let res = check_url(&link, &self.config.link_checker);
                    if res.is_valid() {
                        None
                    } else {
--- a/components/site/tests/site.rs
+++ b/components/site/tests/site.rs
@@ -662,3 +662,14 @@ fn can_ignore_markdown_content() {
    let (_, _tmp_dir, public) = build_site("test_site");
    assert!(!file_exists!(public, "posts/ignored/index.html"));
 }
 #[test]
 fn check_site() {
    let (mut site, _tmp_dir, _public) = build_site("test_site");

    // The prefix list is expected to come from test_site/config.toml
    assert_eq!(
        &site.config.link_checker.skip_anchor_prefixes,
        &vec!["https://github.com/rust-lang/rust/blob/"]
    );

    // Switch to check mode, then load the site; loading must succeed
    site.config.enable_check_mode();
    site.load().expect("link check test_site");
 }
--- a/components/templates/src/global_fns/mod.rs
+++ b/components/templates/src/global_fns/mod.rs
@@ -34,9 +34,10 @@ impl TeraFn for Trans {
        let lang = optional_arg!(String, args.get("lang"), "`trans`: `lang` must be a string.")
            .unwrap_or_else(|| self.config.default_language.clone());
        let term = self.config.get_translation(lang, key).map_err(|e| {
            Error::chain("Failed to retreive term translation", e)
        })?;
        let term = self
            .config
            .get_translation(lang, key)
            .map_err(|e| Error::chain("Failed to retreive term translation", e))?;
        Ok(to_value(term).unwrap())
    }
@@ -509,7 +510,6 @@ mod tests {
        assert!(static_fn.call(&args).is_err());
    }
    const TRANS_CONFIG: &str = r#"
 base_url = "https://remplace-par-ton-url.fr"
 default_language = "fr"
--- a/docs/content/documentation/getting-started/configuration.md
+++ b/docs/content/documentation/getting-started/configuration.md
@@ -95,8 +95,14 @@ extra_syntaxes = []
 #
 #     [translations.en]
 #     title = "A title"
 #
 [translations]
 # Configure the link checker
 [link_checker]
 # Skip anchor checking for external URLs that start with these prefixes
 skip_anchor_prefixes = [
    "https://caniuse.com/",
 ]
 # You can put any kind of data in there and it
 # will be accessible in all templates
--- a/test_site/config.toml
+++ b/test_site/config.toml
@@ -13,5 +13,10 @@ extra_syntaxes = ["syntaxes"]
 ignored_content = ["*/ignored.md"]
 [link_checker]
 skip_anchor_prefixes = [
    "https://github.com/rust-lang/rust/blob/",
 ]
 [extra.author]
 name = "Vincent Prouillet"
--- a/test_site/content/posts/tutorials/programming/rust.md
+++ b/test_site/content/posts/tutorials/programming/rust.md
@@ -5,3 +5,9 @@ date = 2017-01-01
 +++
 A simple page
 <!-- more -->
 Link to some rust-lang [source code][permalink].
 [permalink]: https://github.com/rust-lang/rust/blob/c772948b687488a087356cb91432425662e034b9/src/librustc_back/target/mod.rs#L194-L214