libscilo/config/
instantiated_config.rs

1//! Creating and using the internal representation of all the important files, directories, and lints to check.
2
3use regex::Regex;
4use std::path::Path;
5use std::str::FromStr;
6use std::{path::PathBuf, sync::LazyLock};
7use strum::IntoEnumIterator;
8
9use super::file::ConfigFileRootDirs;
10use super::{error::ConfigError, file::ConfigFile};
11use crate::config::parse::parse_config_file;
12use crate::lints::LintCheck;
13use crate::lints::LintError;
14use crate::version_control::project_root;
15
16/// The default regular expression for [`code`][RootDirs::code]
17/// and [`results`][RootDirs::results] folders to match against.
18/// The format is `YYYY-MM-DD_kebab-case-brief-description`.
19static CODE_RESULTS_SUBDIR_REGEX: LazyLock<Regex> =
20    LazyLock::new(|| Regex::new(r"2\d{3}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])_[A-Za-z-]+").unwrap());
21
/// Default set of file names that are accepted as a README.
///
/// A README is expected in the project root and within each subdirectory of
/// the [`code`][RootDirs::code] and [`data`][RootDirs::data] directories.
static README_FILE_NAMES: LazyLock<Vec<String>> = LazyLock::new(|| {
    let accepted = ["README.md", "README.org", "README.txt", "README"];
    accepted.iter().map(|&name| String::from(name)).collect()
});
33
/// Default set of file names that are accepted as a workflow file.
///
/// A workflow file is expected within each subdirectory of the
/// [`code`][RootDirs::code] directory.
static WORKFLOW_FILE_NAMES: LazyLock<Vec<String>> = LazyLock::new(|| {
    // One entry per supported workflow tool; the order is part of the default value.
    const ACCEPTED: [&str; 8] = [
        "Snakefile",  // Snakemake
        "_targets.R", // Targets
        "main.cwl",   // Cromwell, Common Workflow Language
        "main.nf",    // Nextflow
        "main.pipe",  // BioPipe
        "main.w",     // Guix Workflow Language
        "main.wdl",   // Workflow Description Language
        "Makefile",   // Make
    ];

    ACCEPTED.iter().map(|&name| String::from(name)).collect()
});
56
57/// The internal representation of all the important files, directories, and lints to check.
58///
59/// This data structure isn't created directly from a configuration file.
60/// It is instead instantiated from the [`ConfigFile`] struct, using some intermediate
61/// logic that can fail along the way.
62#[derive(Clone, Debug)]
63pub struct InstantiatedConfig {
64    /// Expected top-level directories to organize the project folder.
65    pub root_dirs: RootDirs,
66
67    /// Regular expression for units of analysis within the code and their matching results.
68    ///
69    /// The default value for this regular expression is [`CODE_RESULTS_SUBDIR_REGEX`].
70    pub code_results_subdir_regex: Option<Regex>,
71
72    /// The explicit list of lint checks that should be performed.
73    /// See [`LintCheck`] for the complete list.
74    pub lints: Vec<LintCheck>,
75
76    /// The explicit list of files that should be present at the root of the
77    /// project directory.
78    pub root_files: Vec<PathBuf>,
79
80    /// The file name for the READMEs that should be present within each
81    /// subdirectory in the [`code`][RootDirs::code] and [`data`][RootDirs::data]
82    /// directories.
83    pub readme_names: Option<Vec<String>>,
84
85    /// The file name for the workflow file that should be present within each
86    /// subdirectory in the [`code`][RootDirs::code] directory.
87    pub workflow_names: Option<Vec<String>>,
88}
89
90impl Default for InstantiatedConfig {
91    fn default() -> Self {
92        Self {
93            root_dirs: RootDirs::default(),
94            code_results_subdir_regex: Some((*CODE_RESULTS_SUBDIR_REGEX).clone()),
95            // by default, use all lints
96            lints: LintCheck::iter().collect(),
97            root_files: Vec::new(),
98            readme_names: Some((*README_FILE_NAMES).clone()),
99            workflow_names: Some((*WORKFLOW_FILE_NAMES).clone()),
100        }
101    }
102}
103
104impl TryFrom<&ConfigFile> for InstantiatedConfig {
105    type Error = ConfigError;
106
107    fn try_from(value: &ConfigFile) -> Result<Self, Self::Error> {
108        // use the defaults if a non-required option is specified in the ConfigFile
109        let defaults = Self::default();
110
111        let root_dirs = RootDirs::from(value.root_dirs.clone());
112
113        // this complicated iteration and mapping will automatically return a `ConfigError`
114        // if there is a typo or other misspecification in the configuration file.
115        let lints = match &value.lints {
116            Some(lint_list) => {
117                // map the string to the enum variant using the `strum` crate
118                // and map a `strum::ParseError` into a `ConfigError` if there is an
119                // error in the string conversion.
120                lint_list
121                    .iter()
122                    .map(|lint_code| {
123                        LintCheck::from_str(lint_code)
124                            .map_err(|_| ConfigError::UnknownLintCode(lint_code.clone()))
125                    })
126                    .collect::<Result<Vec<_>, ConfigError>>()?
127            }
128            None => defaults.lints,
129        };
130
131        let root_files = match &value.root_files {
132            Some(files) => files.iter().map(PathBuf::from).collect(),
133            None => defaults.root_files,
134        };
135
136        // this regex parsing might fail, which means that we need to use a
137        // `TryFrom` instead of a `From`
138        let code_results_subdir_regex = match &value.code_results_subdir_regex {
139            Some(str) => match Regex::new(str) {
140                Ok(re) => Some(re),
141                Err(_) => {
142                    return Err(ConfigError::RegexError {
143                        name: "code_results_subdir_regex".to_string(),
144                        value: str.clone(),
145                    });
146                }
147            },
148            None => defaults.code_results_subdir_regex,
149        };
150
151        let readme_names = match &value.readme_names {
152            Some(val) => Some(val.clone()),
153            None => defaults.readme_names.clone(),
154        };
155
156        let workflow_names = match &value.workflow_names {
157            Some(val) => Some(val.clone()),
158            None => defaults.workflow_names.clone(),
159        };
160
161        Ok(Self {
162            root_dirs,
163            code_results_subdir_regex,
164            lints,
165            root_files,
166            readme_names,
167            workflow_names,
168        })
169    }
170}
171
172impl TryFrom<ConfigFile> for InstantiatedConfig {
173    type Error = ConfigError;
174
175    fn try_from(value: ConfigFile) -> Result<Self, Self::Error> {
176        Self::try_from(&value)
177    }
178}
179
180impl TryFrom<&Path> for InstantiatedConfig {
181    type Error = ConfigError;
182
183    fn try_from(value: &Path) -> Result<Self, Self::Error> {
184        let cfg_file = parse_config_file(value)?;
185        InstantiatedConfig::try_from(cfg_file)
186    }
187}
188
189impl TryFrom<&PathBuf> for InstantiatedConfig {
190    type Error = ConfigError;
191
192    fn try_from(value: &PathBuf) -> Result<Self, Self::Error> {
193        let cfg_file = parse_config_file(value.as_path())?;
194        InstantiatedConfig::try_from(cfg_file)
195    }
196}
197
198impl TryFrom<PathBuf> for InstantiatedConfig {
199    type Error = ConfigError;
200
201    fn try_from(value: PathBuf) -> Result<Self, Self::Error> {
202        Self::try_from(&value)
203    }
204}
205
206impl InstantiatedConfig {
207    /// The main method to run all the requested lints on the project directory.
208    pub fn execute_lints(self) -> Result<(), LintError> {
209        for lint in self.lints.iter() {
210            lint.check(&self)?;
211        }
212        Ok(())
213    }
214}
215
/// The top-level directories that are expected to organize the project folder.
#[derive(Debug, Clone)]
pub struct RootDirs {
    /// Top-level path for code.
    ///
    /// Everything that processes files from [`data`][Self::data], performs
    /// statistical analyses, creates visualizations, and saves the outputs in
    /// [`results`][Self::results] lives in this folder.
    /// It is generally made of a series of subdirectories, each holding a set
    /// of related scripts for a single research question.
    /// A subdirectory may, for example, contain:
    /// - a workflow file (e.g. [`Snakefile`](https://snakemake.github.io/) or [`Nextflow`](https://www.nextflow.io/))
    /// - a data pre-processing script
    /// - a statistical analysis and calculation script
    /// - a plotting script
    pub(crate) code: Option<PathBuf>,

    /// Top-level path for data.
    ///
    /// All raw and processed data should be found within this directory.
    /// Code within the [`code`][Self::code] directory sources the datasets
    /// located here and uses them to generate outputs in the
    /// [`results`][Self::results] directory.
    pub(crate) data: Option<PathBuf>,

    /// Top-level path for results.
    ///
    /// Code and scripts in the [`code`][Self::code] directory should write
    /// their outputs here, which keeps inputs, code, and outputs separate.
    /// It is generally made of a series of subdirectories, each holding all
    /// the outputs that originate from the code in the corresponding
    /// [`code`][Self::code] subdirectory.
    pub(crate) results: Option<PathBuf>,

    /// Top-level path for vendored external code.
    ///
    /// Many research projects need a copy of some external code, such as a
    /// [git submodule](https://git-scm.com/book/en/v2/Git-Tools-Submodules),
    /// or a package manifest of some kind, like a [Nix derivation](https://nixos.org/)
    /// or an [Anaconda recipe](https://anaconda.org/).
    /// Housing all of that code in this folder keeps it from populating the
    /// custom code present in [`code`][Self::code].
    pub(crate) external: Option<PathBuf>,

    /// Top-level path for documentation.
    ///
    /// This is the place for documentation such as a manuscript, experimental
    /// descriptions from contract research organizations, or interactive HTML
    /// notebooks displaying the data and its results.
    /// Data that belongs in [`data`][Self::data], or notebooks which process
    /// data and therefore belong in [`code`][Self::code], should not be placed here.
    pub(crate) docs: Option<PathBuf>,
}
269
270impl Default for RootDirs {
271    fn default() -> Self {
272        let root = project_root();
273
274        Self {
275            code: Some(root.join("code")),
276            data: Some(root.join("data")),
277            results: Some(root.join("results")),
278            external: Some(root.join("pkgs")),
279            docs: Some(root.join("docs")),
280        }
281    }
282}
283
284impl From<Option<ConfigFileRootDirs>> for RootDirs {
285    fn from(value: Option<ConfigFileRootDirs>) -> Self {
286        let defaults = Self::default();
287
288        match value {
289            Some(dirs) => {
290                let code = parse_root_dir_name(dirs.code.as_deref(), defaults.code);
291                let data = parse_root_dir_name(dirs.data.as_deref(), defaults.data);
292                let docs = parse_root_dir_name(dirs.docs.as_deref(), defaults.docs);
293                let external = parse_root_dir_name(dirs.external.as_deref(), defaults.external);
294                let results = parse_root_dir_name(dirs.results.as_deref(), defaults.results);
295
296                Self {
297                    code,
298                    data,
299                    docs,
300                    external,
301                    results,
302                }
303            }
304            None => RootDirs::default(),
305        }
306    }
307}
308
/// Resolves a single root directory from its (optional) configured name.
///
/// - No configured name: the provided `default` is returned unchanged.
/// - An empty configured name: the directory is treated as `None`.
/// - Any other configured name: it is turned into a [`PathBuf`] verbatim
///   (it is not joined onto any other path here).
fn parse_root_dir_name(s: Option<&str>, default: Option<PathBuf>) -> Option<PathBuf> {
    let Some(dir_name) = s else {
        return default;
    };

    if dir_name.is_empty() {
        None
    } else {
        Some(PathBuf::from(dir_name))
    }
}