// lychee_lib/utils/request.rs

1use log::warn;
2use reqwest::Url;
3use std::path::Path;
4use url::PathSegmentsMut;
5
6use crate::{
7    BaseInfo, BasicAuthCredentials, LycheeResult, Request, RequestError, Uri,
8    basic_auth::BasicAuthExtractor,
9    types::{ResolvedInputSource, uri::raw::RawUri},
10};
11
12/// Extract basic auth credentials for a given URL.
13pub(crate) fn extract_credentials(
14    extractor: Option<&BasicAuthExtractor>,
15    uri: &Uri,
16) -> Option<BasicAuthCredentials> {
17    extractor.as_ref().and_then(|ext| ext.matches(uri))
18}
19
20/// Create a request from a raw URI.
21fn create_request(
22    raw_uri: &RawUri,
23    source: &ResolvedInputSource,
24    root_dir: Option<&Path>,
25    base: &BaseInfo,
26    extractor: Option<&BasicAuthExtractor>,
27) -> LycheeResult<Request> {
28    let uri = try_parse_into_uri(raw_uri, root_dir, base)?;
29    let source = source.clone();
30    let element = raw_uri.element.clone();
31    let attribute = raw_uri.attribute.clone();
32    let span = Some(raw_uri.span);
33    let credentials = extract_credentials(extractor, &uri);
34
35    Ok(Request {
36        uri,
37        source,
38        element,
39        attribute,
40        span,
41        credentials,
42    })
43}
44
45/// Try to parse the raw URI into a `Uri`.
46///
47/// If the raw URI is not a valid URI, create a URI by joining the base URL with the text.
48/// If the base URL is not available, create a URI from the file path.
49///
50/// # Errors
51///
52/// - If the text (the unparsed URI represented as a `String`) cannot be joined with the base
53///   to create a valid URI.
54/// - If a URI cannot be created from the file path.
55/// - If the source is not a file path (i.e. the URI type is not supported).
56fn try_parse_into_uri(
57    raw_uri: &RawUri,
58    root_dir: Option<&Path>,
59    base: &BaseInfo,
60) -> LycheeResult<Uri> {
61    // TODO: this conversion should be hoisted up the call stack
62    let root_dir = root_dir.and_then(|x| Url::from_directory_path(x).ok());
63
64    let mut url = base.parse_url_text_with_root_dir(&raw_uri.text, root_dir.as_ref())?;
65
66    // BACKWARDS COMPAT: delete trailing slash for file urls
67    // Without this, then a local link like `README.md/` will fail.
68    if url.scheme() == "file" {
69        if url.path() != "/" && url.path().ends_with('/') {
70            warn!(
71                "Removing trailing slash from file URL: {url}. {} {}",
72                "This lets the URL match both files and folders.",
73                "In future, a file URL ending in / might fail link checking if it points to a file."
74            );
75        }
76        let _ = url
77            .path_segments_mut()
78            .as_mut()
79            .map(PathSegmentsMut::pop_if_empty);
80    }
81
82    Ok(url.into())
83}
84
85/// Create requests out of the collected URLs. Returns a vector of valid URLs
86/// and errors. URLs are not deduplicated because repeated URLs may occur at
87/// different source locations.
88///
89/// If a URLs is ignored (because of the current settings),
90/// it will not be added to the results.
91pub(crate) fn create(
92    uris: Vec<RawUri>,
93    source: &ResolvedInputSource,
94    root_dir: Option<&Path>,
95    fallback_base: &BaseInfo,
96    extractor: Option<&BasicAuthExtractor>,
97) -> Vec<Result<Request, RequestError>> {
98    let source_base = match source.to_base_info() {
99        Ok(base) => base,
100        Err(e) => {
101            // This should be extremely rare and only happens
102            // if a FsPath leads to an invalid URL.
103            return vec![Err(RequestError::InputSourceError(
104                source.clone().into(),
105                e,
106            ))];
107        }
108    };
109
110    // TODO: use_fs_root_as_origin is for backwards compat, so `--base-url file:///a`
111    // can resolve a link of `/b` to `file:///b` (in the absence of root-dir).
112    // maybe change if base-url semantics are changed in future.
113    let fallback_base = fallback_base.use_fs_root_as_origin();
114    let base = source_base.or_fallback(&fallback_base);
115
116    let mut vec = vec![];
117
118    for raw_uri in uris {
119        let result = create_request(&raw_uri, source, root_dir, base, extractor);
120        match result {
121            Ok(request) => {
122                vec.push(Ok(request));
123            }
124            Err(e) => vec.push(Err(RequestError::CreateRequestItem(
125                raw_uri.clone(),
126                source.clone(),
127                e,
128            ))),
129        }
130    }
131
132    vec
133}
134
#[cfg(test)]
mod tests {
    use std::borrow::Cow;
    use std::num::NonZeroUsize;
    use std::path::PathBuf;

    use crate::Request;
    use crate::types::uri::raw::{RawUri, RawUriSpan};

    use super::*;

    /// Span attached to every raw URI built by these tests (line 1, column 1).
    const SPAN: RawUriSpan = RawUriSpan {
        line: NonZeroUsize::MIN,
        column: Some(NonZeroUsize::MIN),
    };

    /// Remote base URL shared by the base-URL test cases.
    const BASE_URL: &str = "https://example.com/path/page.html";

    /// Directory used both as root-dir and as local base in these tests.
    const ROOT_DIR: &str = "/tmp/lychee";

    /// Create requests from the given raw URIs and returns requests that were
    /// constructed successfully, silently ignoring link parsing errors.
    ///
    /// This reduces the `Result` handling which is needed in test cases. Test
    /// cases can still detect the unexpected appearance of errors by the
    /// length being different.
    fn create_ok_only(
        uris: Vec<RawUri>,
        source: &ResolvedInputSource,
        root_dir: Option<&Path>,
        base: &BaseInfo,
        extractor: Option<&BasicAuthExtractor>,
    ) -> Vec<Request> {
        create(uris, source, root_dir, base, extractor)
            .into_iter()
            .filter_map(Result::ok)
            .collect()
    }

    /// Build a raw URI with the given text, no element/attribute, and [`SPAN`].
    fn raw_uri(text: &'static str) -> RawUri {
        RawUri {
            text: text.to_string(),
            element: None,
            attribute: None,
            span: SPAN,
        }
    }

    /// Base info parsed from [`BASE_URL`].
    fn remote_base() -> BaseInfo {
        BaseInfo::try_from(BASE_URL).unwrap()
    }

    /// Base info built from the local directory [`ROOT_DIR`].
    fn local_base() -> BaseInfo {
        BaseInfo::from_path(&PathBuf::from(ROOT_DIR)).unwrap()
    }

    /// An empty string input source.
    fn string_source() -> ResolvedInputSource {
        ResolvedInputSource::String(Cow::Borrowed(""))
    }

    /// A file input source located at `/some/page.html`.
    fn file_source() -> ResolvedInputSource {
        ResolvedInputSource::FsPath(PathBuf::from("/some/page.html"))
    }

    /// Assert that `text` yields exactly one successful request whose URL is
    /// `expected`.
    ///
    /// `#[track_caller]` makes a failing assertion point at the calling test
    /// rather than at this helper.
    #[track_caller]
    fn assert_resolves_to(
        text: &'static str,
        source: &ResolvedInputSource,
        root_dir: Option<&Path>,
        base: &BaseInfo,
        expected: &str,
    ) {
        let requests = create_ok_only(vec![raw_uri(text)], source, root_dir, base, None);

        assert_eq!(requests.len(), 1);
        assert_eq!(requests[0].uri.url.as_str(), expected);
    }

    // --- Base URL only -----------------------------------------------------

    #[test]
    fn test_relative_url_resolution() {
        assert_resolves_to(
            "relative.html",
            &string_source(),
            None,
            &remote_base(),
            "https://example.com/path/relative.html",
        );
    }

    #[test]
    fn test_absolute_url_resolution() {
        assert_resolves_to(
            "https://another.com/page",
            &string_source(),
            None,
            &remote_base(),
            "https://another.com/page",
        );
    }

    #[test]
    fn test_root_relative_url_resolution() {
        assert_resolves_to(
            "/root-relative",
            &string_source(),
            None,
            &remote_base(),
            "https://example.com/root-relative",
        );
    }

    #[test]
    fn test_parent_directory_url_resolution() {
        assert_resolves_to(
            "../parent",
            &string_source(),
            None,
            &remote_base(),
            "https://example.com/parent",
        );
    }

    #[test]
    fn test_fragment_url_resolution() {
        assert_resolves_to(
            "#fragment",
            &string_source(),
            None,
            &remote_base(),
            "https://example.com/path/page.html#fragment",
        );
    }

    // --- Root dir only (file source, no base URL) ---------------------------

    #[test]
    fn test_relative_url_resolution_from_root_dir() {
        assert_resolves_to(
            "relative.html",
            &file_source(),
            Some(Path::new(ROOT_DIR)),
            &BaseInfo::none(),
            "file:///some/relative.html",
        );
    }

    #[test]
    fn test_absolute_url_resolution_from_root_dir() {
        assert_resolves_to(
            "https://another.com/page",
            &file_source(),
            Some(Path::new(ROOT_DIR)),
            &BaseInfo::none(),
            "https://another.com/page",
        );
    }

    #[test]
    fn test_root_relative_url_resolution_from_root_dir() {
        assert_resolves_to(
            "/root-relative",
            &file_source(),
            Some(Path::new(ROOT_DIR)),
            &BaseInfo::none(),
            "file:///tmp/lychee/root-relative",
        );
    }

    #[test]
    fn test_parent_directory_url_resolution_from_root_dir() {
        assert_resolves_to(
            "../parent",
            &file_source(),
            Some(Path::new(ROOT_DIR)),
            &BaseInfo::none(),
            "file:///parent",
        );
    }

    #[test]
    fn test_fragment_url_resolution_from_root_dir() {
        assert_resolves_to(
            "#fragment",
            &file_source(),
            Some(Path::new(ROOT_DIR)),
            &BaseInfo::none(),
            "file:///some/page.html#fragment",
        );
    }

    // --- Root dir and base URL (file source) --------------------------------

    #[test]
    fn test_relative_url_resolution_from_root_dir_and_base_url() {
        assert_resolves_to(
            "relative.html",
            &file_source(),
            Some(Path::new(ROOT_DIR)),
            &remote_base(),
            "https://example.com/path/relative.html",
        );
    }

    #[test]
    fn test_absolute_url_resolution_from_root_dir_and_base_url() {
        assert_resolves_to(
            "https://another.com/page",
            &file_source(),
            Some(Path::new(ROOT_DIR)),
            &remote_base(),
            "https://another.com/page",
        );
    }

    #[test]
    fn test_root_relative_url_resolution_from_root_dir_and_base_url() {
        assert_resolves_to(
            "/root-relative",
            &file_source(),
            Some(Path::new(ROOT_DIR)),
            &remote_base(),
            "https://example.com/root-relative",
        );
    }

    #[test]
    fn test_parent_directory_url_resolution_from_root_dir_and_base_url() {
        assert_resolves_to(
            "../parent",
            &file_source(),
            Some(Path::new(ROOT_DIR)),
            &remote_base(),
            "https://example.com/parent",
        );
    }

    #[test]
    fn test_fragment_url_resolution_from_root_dir_and_base_url() {
        assert_resolves_to(
            "#fragment",
            &file_source(),
            Some(Path::new(ROOT_DIR)),
            &remote_base(),
            "https://example.com/path/page.html#fragment",
        );
    }

    // --- No base at all ------------------------------------------------------

    #[test]
    fn test_no_base_url_resolution() {
        assert_resolves_to(
            "https://example.com/page",
            &string_source(),
            None,
            &BaseInfo::none(),
            "https://example.com/page",
        );
    }

    // --- create_request / try_parse_into_uri with a local base --------------

    #[test]
    fn test_create_request_from_relative_file_path() {
        let base = local_base();
        let input_source = ResolvedInputSource::FsPath(PathBuf::from("page.html"));

        let actual =
            create_request(&raw_uri("file.html"), &input_source, None, &base, None).unwrap();

        let expected = Request::new(
            Uri {
                url: Url::from_file_path("/tmp/lychee/file.html").unwrap(),
            },
            input_source,
        )
        .with_span(SPAN);

        assert_eq!(actual, expected);
    }

    #[test]
    fn test_create_request_from_relative_file_path_errors() {
        // relative links unsupported from stdin
        let from_stdin = create_request(
            &raw_uri("file.html"),
            &ResolvedInputSource::Stdin,
            None,
            &BaseInfo::none(),
            None,
        );
        assert!(from_stdin.is_err());

        // error because no root-dir and no base-url
        let without_root = create_request(
            &raw_uri("/file.html"),
            &ResolvedInputSource::FsPath(PathBuf::from("page.html")),
            None,
            &BaseInfo::none(),
            None,
        );
        assert!(without_root.is_err());
    }

    #[test]
    fn test_create_request_from_absolute_file_path() {
        let base = local_base();
        let input_source = ResolvedInputSource::FsPath(PathBuf::from("/tmp/lychee/page.html"));

        // Use an absolute path that's outside the base directory
        let actual = create_request(
            &raw_uri("/usr/local/share/doc/example.html"),
            &input_source,
            None,
            &base,
            None,
        )
        .unwrap();

        let expected = Request::new(
            Uri {
                url: Url::from_file_path("/tmp/lychee/usr/local/share/doc/example.html")
                    .unwrap(),
            },
            input_source,
        )
        .with_span(SPAN);

        assert_eq!(actual, expected);
    }

    #[test]
    fn test_parse_relative_path_into_uri() {
        let base = local_base();

        let link = raw_uri("relative.html");
        let uri = try_parse_into_uri(&link, None, &base).unwrap();

        assert_eq!(uri.url.as_str(), "file:///tmp/lychee/relative.html");
    }

    #[test]
    fn test_parse_absolute_path_into_uri() {
        let base = local_base();

        // Leading slash: this must be a root-relative link, otherwise the test
        // merely repeats `test_parse_relative_path_into_uri`. As in
        // `test_create_request_from_absolute_file_path`, the link is expected
        // to land underneath the local base directory.
        let link = raw_uri("/absolute.html");
        let uri = try_parse_into_uri(&link, None, &base).unwrap();

        assert_eq!(uri.url.as_str(), "file:///tmp/lychee/absolute.html");
    }
}