lychee_lib/types/input/source.rs
1//! Input source type definitions.
2//!
3//! lychee can handle different kinds of input sources:
4//! - URLs (of HTTP/HTTPS scheme)
5//! - File system paths (to files or directories)
6//! - Unix shell-style glob patterns (e.g. `./docs/**/*.md`)
7//! - Standard input (`stdin`)
8//! - Raw strings (UTF-8 only for now)
9//!
10//! Each input source is handled differently:
11//! - File paths are walked (if they are directories) and filtered by
12//! extension
13//! - Glob patterns are expanded to matching file paths, which are then walked
14//! and filtered by extension
15//! - URLs, raw strings, and standard input (`stdin`) are read directly
16
17use crate::BaseInfo;
18use crate::ErrorKind;
19
20use glob::Pattern;
21use reqwest::Url;
22use serde::{Deserialize, Deserializer, Serialize};
23use std::borrow::Cow;
24use std::fmt::Display;
25use std::path::PathBuf;
26use std::result::Result;
27
28/// Input types which lychee supports
29#[derive(Debug, Clone, PartialEq, Eq, Hash, Deserialize)]
30#[non_exhaustive]
31pub enum InputSource {
32 /// URL (of HTTP/HTTPS scheme).
33 RemoteUrl(Box<Url>),
34 /// Unix shell-style glob pattern.
35 FsGlob {
36 /// The glob pattern matching all input files
37 #[serde(deserialize_with = "InputSource::deserialize_pattern")]
38 pattern: Pattern,
39 /// Don't be case sensitive when matching files against a glob pattern
40 ignore_case: bool,
41 },
42 /// File path.
43 FsPath(PathBuf),
44 /// Standard Input.
45 Stdin,
46 /// Raw string input.
47 String(Cow<'static, str>),
48}
49
50impl InputSource {
51 const STDIN: &str = "-";
52
53 /// Parses a [`InputSource`] from the given string. The kind of input source will be
54 /// automatically detected according to certain rules and precedences.
55 ///
56 /// # Errors
57 ///
58 /// Returns an error if:
59 /// - the input does not exist (i.e. the path is invalid)
60 /// - the input cannot be parsed as a URL
61 pub fn new(input: &str, glob_ignore_case: bool) -> Result<Self, ErrorKind> {
62 if input == Self::STDIN {
63 return Ok(InputSource::Stdin);
64 }
65
66 // We use [`reqwest::Url::parse`] because it catches some other edge cases that [`http::Request:builder`] does not
67 if let Ok(url) = Url::parse(input) {
68 // Weed out non-HTTP schemes, including Windows drive
69 // specifiers, which can be parsed by the
70 // [url](https://crates.io/crates/url) crate
71 return match url.scheme() {
72 "http" | "https" => Ok(InputSource::RemoteUrl(Box::new(url))),
73 _ => Err(ErrorKind::InvalidFile(PathBuf::from(input))),
74 };
75 }
76
77 // This seems to be the only way to determine if this is a glob pattern
78 let is_glob = glob::Pattern::escape(input) != input;
79
80 if is_glob {
81 return Ok(InputSource::FsGlob {
82 pattern: Pattern::new(input)?,
83 ignore_case: glob_ignore_case,
84 });
85 }
86
87 // It might be a file path; check if it exists
88 let path = PathBuf::from(input);
89
90 // On Windows, a filepath can never be mistaken for a
91 // URL, because Windows filepaths use `\` and URLs use
92 // `/`
93 #[cfg(windows)]
94 if path.exists() {
95 // The file exists, so we return the path
96 Ok(InputSource::FsPath(path))
97 } else {
98 // We have a valid filepath, but the file does not
99 // exist so we return an error
100 Err(ErrorKind::InvalidFile(path))
101 }
102
103 #[cfg(unix)]
104 if path.exists() {
105 Ok(InputSource::FsPath(path))
106 } else if input.starts_with('~') || input.starts_with('.') {
107 // The path is not valid, but it might still be a
108 // valid URL.
109 //
110 // Check if the path starts with a tilde (`~`) or a
111 // dot and exit early if it does.
112 //
113 // This check might not be sufficient to cover all cases
114 // but it catches the most common ones
115 Err(ErrorKind::InvalidFile(path))
116 } else {
117 // Invalid path; check if a valid URL can be constructed from the input
118 // by prefixing it with a `http://` scheme.
119 //
120 // Curl also uses http (i.e. not https), see
121 // https://github.com/curl/curl/blob/70ac27604a2abfa809a7b2736506af0da8c3c8a9/lib/urlapi.c#L1104-L1124
122 //
123 // TODO: We should get rid of this heuristic and
124 // require users to provide a full URL with scheme.
125 // This is a big source of confusion to users.
126 let url = Url::parse(&format!("http://{input}"))
127 .map_err(|e| ErrorKind::ParseUrl(e, "Input is not a valid URL".to_string()))?;
128 Ok(InputSource::RemoteUrl(Box::new(url)))
129 }
130 }
131
132 fn deserialize_pattern<'de, D>(deserializer: D) -> Result<Pattern, D::Error>
133 where
134 D: Deserializer<'de>,
135 {
136 use serde::de::Error;
137 let s = String::deserialize(deserializer)?;
138 Pattern::new(&s).map_err(D::Error::custom)
139 }
140}
141
142/// Resolved input sources that can be processed for content.
143///
144/// This represents input sources after glob pattern expansion.
145/// It is identical to `InputSource`, except that glob patterns
146/// have been resolved to concrete file paths.
147///
148/// We use a separate type to avoid handling the (no longer applicable)
149/// glob case in downstream processing.
150#[derive(Debug, Clone, PartialEq, Eq, Hash)]
151pub enum ResolvedInputSource {
152 /// URL (of HTTP/HTTPS scheme).
153 RemoteUrl(Box<Url>),
154 /// File path.
155 FsPath(PathBuf),
156 /// Standard Input.
157 Stdin,
158 /// Raw string input.
159 String(Cow<'static, str>),
160}
161
162impl ResolvedInputSource {
163 /// Converts a [`ResolvedInputSource::RemoteUrl`] or
164 /// [`ResolvedInputSource::FsPath`] to a [`BaseInfo`] for the source.
165 ///
166 /// For other variants (i.e., those without a URL), [`BaseInfo::None`]
167 /// is returned.
168 ///
169 /// # Errors
170 ///
171 /// Returns an error if building a URL from a [`ResolvedInputSource::FsPath`]
172 /// fails.
173 pub fn to_base_info(&self) -> Result<BaseInfo, ErrorKind> {
174 let url = match self {
175 Self::RemoteUrl(url) => Cow::Borrowed(&**url),
176 Self::FsPath(path) => std::path::absolute(path)
177 .ok()
178 .and_then(|x| Url::from_file_path(x).ok())
179 .map(Cow::Owned)
180 .ok_or_else(|| ErrorKind::InvalidUrlFromPath(path.to_owned()))?,
181 _ => return Ok(BaseInfo::none()),
182 };
183
184 Ok(BaseInfo::from_source_url(&url))
185 }
186}
187
188impl From<ResolvedInputSource> for InputSource {
189 fn from(resolved: ResolvedInputSource) -> Self {
190 match resolved {
191 ResolvedInputSource::RemoteUrl(url) => InputSource::RemoteUrl(url),
192 ResolvedInputSource::FsPath(path) => InputSource::FsPath(path),
193 ResolvedInputSource::Stdin => InputSource::Stdin,
194 ResolvedInputSource::String(s) => InputSource::String(s),
195 }
196 }
197}
198
199impl Display for ResolvedInputSource {
200 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
201 f.write_str(match self {
202 Self::RemoteUrl(url) => url.as_str(),
203 Self::FsPath(path) => path.to_str().unwrap_or_default(),
204 Self::Stdin => "stdin",
205 Self::String(s) => s.as_ref(),
206 })
207 }
208}
209
210/// Custom serialization for the `InputSource` enum.
211///
212/// This implementation serializes all variants as strings to ensure
213/// compatibility with JSON serialization, which requires string keys for enums.
214///
215/// Without this custom implementation, attempting to serialize `InputSource` to
216/// JSON would result in a "key must be a string" error.
217///
218/// See: <https://github.com/serde-rs/json/issues/45>
219impl Serialize for InputSource {
220 fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
221 where
222 S: serde::Serializer,
223 {
224 serializer.collect_str(self)
225 }
226}
227
228impl Display for InputSource {
229 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
230 f.write_str(match self {
231 Self::RemoteUrl(url) => url.as_str(),
232 Self::FsGlob { pattern, .. } => pattern.as_str(),
233 Self::FsPath(path) => path.to_str().unwrap_or_default(),
234 Self::Stdin => "stdin",
235 Self::String(s) => s.as_ref(),
236 })
237 }
238}
239
#[cfg(test)]
mod tests {
    use super::*;

    /// `FsGlob` is serialized through its `Display` output, which writes
    /// [`glob::Pattern::as_str`].
    ///
    /// This checks that the serialized value is exactly the pattern string
    /// the glob was created from.
    #[test]
    fn test_pattern_serialization_is_original_pattern() {
        let raw_pattern = "asd[f]*";
        let source = InputSource::FsGlob {
            pattern: Pattern::new(raw_pattern).unwrap(),
            ignore_case: false,
        };

        let actual = serde_json::to_string(&source).unwrap();
        let expected = serde_json::to_string(raw_pattern).unwrap();

        assert_eq!(actual, expected);
    }
}