libscilo/config/instantiated_config.rs
1//! Creating and using the internal representation of all the important files, directories, and lints to check.
2
3use regex::Regex;
4use std::path::Path;
5use std::str::FromStr;
6use std::{path::PathBuf, sync::LazyLock};
7use strum::IntoEnumIterator;
8
9use super::file::ConfigFileRootDirs;
10use super::{error::ConfigError, file::ConfigFile};
11use crate::config::parse::parse_config_file;
12use crate::lints::LintCheck;
13use crate::lints::LintError;
14use crate::version_control::project_root;
15
16/// The default regular expression for [`code`][RootDirs::code]
17/// and [`results`][RootDirs::results] folders to match against.
18/// The format is `YYYY-MM-DD_kebab-case-brief-description`.
19static CODE_RESULTS_SUBDIR_REGEX: LazyLock<Regex> =
20 LazyLock::new(|| Regex::new(r"2\d{3}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])_[A-Za-z-]+").unwrap());
21
/// The README file names accepted by default.
///
/// A README with one of these names should be present in the project root
/// and within each subdirectory in the [`code`][RootDirs::code]
/// and [`data`][RootDirs::data] directories.
static README_FILE_NAMES: LazyLock<Vec<String>> = LazyLock::new(|| {
    ["README.md", "README.org", "README.txt", "README"]
        .iter()
        .map(|name| name.to_string())
        .collect()
});
33
/// The workflow file names accepted by default.
///
/// A workflow file with one of these names should be present within each
/// subdirectory in the [`code`][RootDirs::code] directory.
static WORKFLOW_FILE_NAMES: LazyLock<Vec<String>> = LazyLock::new(|| {
    const NAMES: [&str; 8] = [
        "Snakefile",  // Snakemake
        "_targets.R", // Targets
        "main.cwl",   // Cromwell, Common Workflow Language
        "main.nf",    // Nextflow
        "main.pipe",  // BioPipe
        "main.w",     // Guix Workflow Language
        "main.wdl",   // Workflow Description Language
        "Makefile",   // Make
    ];

    NAMES.iter().map(|name| String::from(*name)).collect()
});
56
/// The internal representation of all the important files, directories, and lints to check.
///
/// This data structure isn't created directly from a configuration file.
/// It is instead instantiated from the [`ConfigFile`] struct through the
/// `TryFrom` implementations in this module, using some intermediate logic
/// that can fail along the way (for example an unknown lint code or an
/// invalid regular expression).
///
/// Options that are absent from the [`ConfigFile`] take the values provided by
/// this type's [`Default`] implementation.
#[derive(Clone, Debug)]
pub struct InstantiatedConfig {
    /// Expected top-level directories to organize the project folder.
    pub root_dirs: RootDirs,

    /// Regular expression for units of analysis within the code and their matching results.
    ///
    /// The default value for this regular expression is [`CODE_RESULTS_SUBDIR_REGEX`].
    // NOTE(review): both the default and a user-supplied pattern are stored as `Some(..)`;
    // what a lint does with `None` is not visible from this module.
    pub code_results_subdir_regex: Option<Regex>,

    /// The explicit list of lint checks that should be performed.
    /// See [`LintCheck`] for the complete list.
    ///
    /// By default every [`LintCheck`] variant is included.
    pub lints: Vec<LintCheck>,

    /// The explicit list of files that should be present at the root of the
    /// project directory.
    ///
    /// By default this list is empty.
    pub root_files: Vec<PathBuf>,

    /// The accepted file names for the READMEs that should be present within
    /// each subdirectory in the [`code`][RootDirs::code] and
    /// [`data`][RootDirs::data] directories.
    ///
    /// The default value is [`README_FILE_NAMES`].
    pub readme_names: Option<Vec<String>>,

    /// The accepted file names for the workflow file that should be present
    /// within each subdirectory in the [`code`][RootDirs::code] directory.
    ///
    /// The default value is [`WORKFLOW_FILE_NAMES`].
    pub workflow_names: Option<Vec<String>>,
}
89
90impl Default for InstantiatedConfig {
91 fn default() -> Self {
92 Self {
93 root_dirs: RootDirs::default(),
94 code_results_subdir_regex: Some((*CODE_RESULTS_SUBDIR_REGEX).clone()),
95 // by default, use all lints
96 lints: LintCheck::iter().collect(),
97 root_files: Vec::new(),
98 readme_names: Some((*README_FILE_NAMES).clone()),
99 workflow_names: Some((*WORKFLOW_FILE_NAMES).clone()),
100 }
101 }
102}
103
104impl TryFrom<&ConfigFile> for InstantiatedConfig {
105 type Error = ConfigError;
106
107 fn try_from(value: &ConfigFile) -> Result<Self, Self::Error> {
108 // use the defaults if a non-required option is specified in the ConfigFile
109 let defaults = Self::default();
110
111 let root_dirs = RootDirs::from(value.root_dirs.clone());
112
113 // this complicated iteration and mapping will automatically return a `ConfigError`
114 // if there is a typo or other misspecification in the configuration file.
115 let lints = match &value.lints {
116 Some(lint_list) => {
117 // map the string to the enum variant using the `strum` crate
118 // and map a `strum::ParseError` into a `ConfigError` if there is an
119 // error in the string conversion.
120 lint_list
121 .iter()
122 .map(|lint_code| {
123 LintCheck::from_str(lint_code)
124 .map_err(|_| ConfigError::UnknownLintCode(lint_code.clone()))
125 })
126 .collect::<Result<Vec<_>, ConfigError>>()?
127 }
128 None => defaults.lints,
129 };
130
131 let root_files = match &value.root_files {
132 Some(files) => files.iter().map(PathBuf::from).collect(),
133 None => defaults.root_files,
134 };
135
136 // this regex parsing might fail, which means that we need to use a
137 // `TryFrom` instead of a `From`
138 let code_results_subdir_regex = match &value.code_results_subdir_regex {
139 Some(str) => match Regex::new(str) {
140 Ok(re) => Some(re),
141 Err(_) => {
142 return Err(ConfigError::RegexError {
143 name: "code_results_subdir_regex".to_string(),
144 value: str.clone(),
145 });
146 }
147 },
148 None => defaults.code_results_subdir_regex,
149 };
150
151 let readme_names = match &value.readme_names {
152 Some(val) => Some(val.clone()),
153 None => defaults.readme_names.clone(),
154 };
155
156 let workflow_names = match &value.workflow_names {
157 Some(val) => Some(val.clone()),
158 None => defaults.workflow_names.clone(),
159 };
160
161 Ok(Self {
162 root_dirs,
163 code_results_subdir_regex,
164 lints,
165 root_files,
166 readme_names,
167 workflow_names,
168 })
169 }
170}
171
172impl TryFrom<ConfigFile> for InstantiatedConfig {
173 type Error = ConfigError;
174
175 fn try_from(value: ConfigFile) -> Result<Self, Self::Error> {
176 Self::try_from(&value)
177 }
178}
179
180impl TryFrom<&Path> for InstantiatedConfig {
181 type Error = ConfigError;
182
183 fn try_from(value: &Path) -> Result<Self, Self::Error> {
184 let cfg_file = parse_config_file(value)?;
185 InstantiatedConfig::try_from(cfg_file)
186 }
187}
188
189impl TryFrom<&PathBuf> for InstantiatedConfig {
190 type Error = ConfigError;
191
192 fn try_from(value: &PathBuf) -> Result<Self, Self::Error> {
193 let cfg_file = parse_config_file(value.as_path())?;
194 InstantiatedConfig::try_from(cfg_file)
195 }
196}
197
198impl TryFrom<PathBuf> for InstantiatedConfig {
199 type Error = ConfigError;
200
201 fn try_from(value: PathBuf) -> Result<Self, Self::Error> {
202 Self::try_from(&value)
203 }
204}
205
206impl InstantiatedConfig {
207 /// The main method to run all the requested lints on the project directory.
208 pub fn execute_lints(self) -> Result<(), LintError> {
209 for lint in self.lints.iter() {
210 lint.check(&self)?;
211 }
212 Ok(())
213 }
214}
215
/// Expected top-level directories to organize the project folder.
///
/// Each directory is optional. When built from a configuration file, a
/// directory given as an empty string is stored as `None`, and a directory
/// that is not mentioned at all takes its default value
/// (see the [`Default`] implementation of this type).
#[derive(Clone, Debug)]
pub struct RootDirs {
    /// Top-level path for code.
    ///
    /// This folder contains all the code used to process files from [`data`][Self::data],
    /// perform statistical analyses, create visualizations, and save the outputs
    /// in [`results`][Self::results].
    /// Generally, this directory contains a series of subdirectories, each of
    /// which contains a set of related scripts for a single research question.
    /// For example, each subdirectory may contain:
    /// - a workflow file (e.g. [`Snakefile`](https://snakemake.github.io/) or [`Nextflow`](https://www.nextflow.io/))
    /// - a data pre-processing script
    /// - a statistical analysis and calculation script
    /// - a plotting script
    ///
    /// Defaults to the `code` directory under the project root.
    pub(crate) code: Option<PathBuf>,

    /// Top-level path for data.
    ///
    /// All raw and processed data should be found within this directory.
    /// The datasets located here will then be sourced by code within the [`code`][Self::code]
    /// directory and used to generate outputs in the [`results`][Self::results]
    /// directory.
    ///
    /// Defaults to the `data` directory under the project root.
    pub(crate) data: Option<PathBuf>,

    /// Top-level path for results.
    ///
    /// Code and scripts in the [`code`][Self::code] directory should create outputs
    /// in this directory to ensure a separation of inputs, code, and outputs.
    /// Generally, this directory contains a series of subdirectories, each of
    /// which will contain all the outputs originating from the code in the
    /// corresponding [`code`][Self::code] subdirectory.
    ///
    /// Defaults to the `results` directory under the project root.
    pub(crate) results: Option<PathBuf>,

    /// Top-level path for vendored external code.
    ///
    /// In many research projects you will need a copy of some external code,
    /// such as a [git submodule](https://git-scm.com/book/en/v2/Git-Tools-Submodules),
    /// or a package manifest of some kind, like a [Nix derivation](https://nixos.org/)
    /// or an [Anaconda recipe](https://anaconda.org/).
    /// This folder can be used to house all of the relevant code so as to not
    /// populate the custom code present in [`code`][Self::code].
    ///
    /// Defaults to the `pkgs` directory under the project root.
    pub(crate) external: Option<PathBuf>,

    /// Top-level path for documentation.
    ///
    /// Documentation such as a manuscript, experimental descriptions from contract
    /// research organizations, or interactive HTML notebooks displaying the data
    /// and its results can go here.
    /// Data that belongs in [`data`][Self::data] or notebooks which process data
    /// that belong in [`code`][Self::code] should not be placed in here.
    ///
    /// Defaults to the `docs` directory under the project root.
    pub(crate) docs: Option<PathBuf>,
}
269
270impl Default for RootDirs {
271 fn default() -> Self {
272 let root = project_root();
273
274 Self {
275 code: Some(root.join("code")),
276 data: Some(root.join("data")),
277 results: Some(root.join("results")),
278 external: Some(root.join("pkgs")),
279 docs: Some(root.join("docs")),
280 }
281 }
282}
283
284impl From<Option<ConfigFileRootDirs>> for RootDirs {
285 fn from(value: Option<ConfigFileRootDirs>) -> Self {
286 let defaults = Self::default();
287
288 match value {
289 Some(dirs) => {
290 let code = parse_root_dir_name(dirs.code.as_deref(), defaults.code);
291 let data = parse_root_dir_name(dirs.data.as_deref(), defaults.data);
292 let docs = parse_root_dir_name(dirs.docs.as_deref(), defaults.docs);
293 let external = parse_root_dir_name(dirs.external.as_deref(), defaults.external);
294 let results = parse_root_dir_name(dirs.results.as_deref(), defaults.results);
295
296 Self {
297 code,
298 data,
299 docs,
300 external,
301 results,
302 }
303 }
304 None => RootDirs::default(),
305 }
306 }
307}
308
/// Resolves the configured name of a root directory.
///
/// - No value in the configuration file (`None`): `default` is returned.
/// - An empty string: the result is `None`.
/// - Any other string: it is converted into a [`PathBuf`] unchanged.
fn parse_root_dir_name(s: Option<&str>, default: Option<PathBuf>) -> Option<PathBuf> {
    let Some(configured) = s else {
        return default;
    };

    if configured.is_empty() {
        None
    } else {
        Some(PathBuf::from(configured))
    }
}