2025-05-26 18:43:00 +01:00
|
|
|
use crate::error::{ProcessError, Result};
|
2025-05-23 20:03:30 +01:00
|
|
|
use crate::tools;
|
2025-05-25 00:19:03 +01:00
|
|
|
use crate::{FILES_TO_CONVERT, FILES_TO_COPY, PATHS_TO_IGNORE};
|
2025-05-26 18:43:00 +01:00
|
|
|
use log::{debug, info, warn};
|
2025-05-23 19:22:44 +01:00
|
|
|
use std::collections::HashSet;
|
|
|
|
use std::fs;
|
|
|
|
use std::path::{Path, PathBuf};
|
|
|
|
use std::time::UNIX_EPOCH;
|
|
|
|
|
2025-05-26 18:43:00 +01:00
|
|
|
/// Mirrors a source directory tree into a target directory.
///
/// Supported documents are converted or copied into the target tree, and
/// target files that no longer correspond to a source file are cleaned up.
#[derive(Debug)]
pub struct DirectoryProcessor {
    /// Root of the tree that is scanned for input files.
    pub(crate) source_dir: PathBuf,
    /// Root of the tree that receives converted and copied files.
    pub(crate) target_dir: PathBuf,
    /// Paths of the collected source files, stored relative to `source_dir`.
    pub(crate) source_files: HashSet<PathBuf>,
}
|
|
|
|
|
|
|
|
impl DirectoryProcessor {
|
2025-05-26 18:43:00 +01:00
|
|
|
/// Creates a new DirectoryProcessor instance.
|
|
|
|
///
|
|
|
|
/// # Arguments
|
|
|
|
/// * `source` - The source directory containing files to process
|
|
|
|
/// * `target` - The target directory where processed files will be placed
|
2025-05-23 19:22:44 +01:00
|
|
|
pub fn new(source: PathBuf, target: PathBuf) -> Self {
|
|
|
|
Self {
|
|
|
|
source_dir: source,
|
|
|
|
target_dir: target,
|
|
|
|
source_files: HashSet::new(),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2025-05-26 18:43:00 +01:00
|
|
|
/// Reports whether `dest_path` is missing or older than `source_path`.
///
/// Returns `true` when the destination does not exist, or when the source's
/// modification time is strictly newer than the destination's. A modification
/// time that cannot be read is treated as the Unix epoch.
pub(crate) fn needs_copy_or_conversion(source_path: &Path, dest_path: &Path) -> bool {
    // Shared lookup: modification time of a path, or the epoch on any error.
    let modified_or_epoch = |location: &Path| {
        match fs::metadata(location).and_then(|meta| meta.modified()) {
            Ok(stamp) => stamp,
            Err(_) => UNIX_EPOCH,
        }
    };

    // `||` short-circuits, so no metadata is read for a missing destination.
    !dest_path.exists() || modified_or_epoch(source_path) > modified_or_epoch(dest_path)
}
|
|
|
|
|
2025-05-26 18:43:00 +01:00
|
|
|
/// Collects all source files that need processing.
|
|
|
|
fn get_source_files(&mut self, current_dir: &Path) -> Result<()> {
|
|
|
|
let entries = fs::read_dir(current_dir).map_err(ProcessError::Io)?;
|
2025-05-23 19:22:44 +01:00
|
|
|
|
|
|
|
for entry in entries.flatten() {
|
|
|
|
let path = entry.path();
|
|
|
|
if path.is_dir() {
|
|
|
|
self.get_source_files(&path)?;
|
|
|
|
} else if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
|
2025-05-26 18:43:00 +01:00
|
|
|
let ext = ext.to_lowercase();
|
|
|
|
if FILES_TO_CONVERT.contains(&ext.as_str()) || FILES_TO_COPY.contains(&ext.as_str()) {
|
2025-05-23 19:22:44 +01:00
|
|
|
if let Ok(rel_path) = path.strip_prefix(&self.source_dir) {
|
|
|
|
self.source_files.insert(rel_path.to_path_buf());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
2025-05-26 18:43:00 +01:00
|
|
|
/// Cleans up the target directory by removing obsolete files and empty directories.
|
|
|
|
fn clean_target_directory(&self, current_dir: &Path) -> Result<bool> {
|
|
|
|
let entries = fs::read_dir(current_dir).map_err(ProcessError::Io)?;
|
2025-05-23 19:22:44 +01:00
|
|
|
let mut is_empty = true;
|
|
|
|
|
|
|
|
for entry in entries.flatten() {
|
|
|
|
let path = entry.path();
|
2025-05-25 00:18:11 +01:00
|
|
|
|
2025-05-26 18:43:00 +01:00
|
|
|
// Check if path should be ignored
|
2025-05-25 00:18:11 +01:00
|
|
|
if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
|
2025-05-25 00:19:03 +01:00
|
|
|
if PATHS_TO_IGNORE.iter().any(|&ignore| name.contains(ignore)) {
|
2025-05-25 00:18:11 +01:00
|
|
|
is_empty = false;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2025-05-23 19:22:44 +01:00
|
|
|
if path.is_dir() {
|
2025-05-26 18:43:00 +01:00
|
|
|
match self.clean_target_directory(&path) {
|
|
|
|
Ok(subdir_empty) => {
|
|
|
|
if subdir_empty {
|
|
|
|
if let Err(e) = fs::remove_dir(&path) {
|
|
|
|
warn!("Could not remove empty directory {}: {}", path.display(), e);
|
|
|
|
} else {
|
|
|
|
debug!("Removed empty directory: {}", path.display());
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
is_empty = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Err(e) => {
|
|
|
|
warn!("Error cleaning directory {}: {}", path.display(), e);
|
|
|
|
is_empty = false;
|
2025-05-23 19:22:44 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
2025-05-26 18:43:00 +01:00
|
|
|
let rel_path = path.strip_prefix(&self.target_dir).map_err(ProcessError::StripPrefix)?;
|
|
|
|
let should_exist = self.should_file_exist(rel_path);
|
2025-05-23 19:22:44 +01:00
|
|
|
|
|
|
|
if !should_exist {
|
|
|
|
if let Err(e) = fs::remove_file(&path) {
|
2025-05-26 18:43:00 +01:00
|
|
|
warn!("Could not remove file {}: {}", path.display(), e);
|
2025-05-23 19:22:44 +01:00
|
|
|
} else {
|
2025-05-26 18:43:00 +01:00
|
|
|
debug!("Removed obsolete file: {}", path.display());
|
2025-05-23 19:22:44 +01:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
is_empty = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(is_empty)
|
|
|
|
}
|
|
|
|
|
2025-05-26 18:43:00 +01:00
|
|
|
/// Determines if a file in the target directory should exist based on source files.
|
|
|
|
fn should_file_exist(&self, rel_path: &Path) -> bool {
|
|
|
|
if let Some(ext) = rel_path.extension().and_then(|e| e.to_str()) {
|
|
|
|
if ext == "pdf" {
|
|
|
|
// For PDF files, check if any corresponding source file exists
|
|
|
|
FILES_TO_CONVERT
|
|
|
|
.iter()
|
|
|
|
.any(|&ext| self.source_files.contains(&rel_path.with_extension(ext)))
|
|
|
|
} else {
|
|
|
|
// For other files, check if they exist in source
|
|
|
|
self.source_files.contains(rel_path)
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
false
|
2025-05-23 19:22:44 +01:00
|
|
|
}
|
2025-05-26 18:43:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Processes a single directory, converting or copying files as needed.
|
|
|
|
fn process_directory(&self, current_source: &Path, current_target: &Path) -> Result<()> {
|
|
|
|
fs::create_dir_all(current_target).map_err(ProcessError::Io)?;
|
2025-05-23 19:22:44 +01:00
|
|
|
|
2025-05-26 18:43:00 +01:00
|
|
|
let entries = fs::read_dir(current_source).map_err(ProcessError::Io)?;
|
2025-05-23 19:22:44 +01:00
|
|
|
|
|
|
|
for entry in entries.flatten() {
|
|
|
|
let path = entry.path();
|
2025-05-23 20:03:30 +01:00
|
|
|
|
2025-05-23 19:22:44 +01:00
|
|
|
if path.is_dir() {
|
2025-05-23 20:03:30 +01:00
|
|
|
let relative_path = path
|
|
|
|
.strip_prefix(&self.source_dir)
|
2025-05-26 18:43:00 +01:00
|
|
|
.map_err(ProcessError::StripPrefix)?;
|
2025-05-23 19:22:44 +01:00
|
|
|
let dest_subdir = self.target_dir.join(relative_path);
|
|
|
|
self.process_directory(&path, &dest_subdir)?;
|
2025-05-26 18:43:00 +01:00
|
|
|
} else {
|
|
|
|
self.process_file(&path, current_source, current_target)?;
|
2025-05-23 19:22:44 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
2025-05-26 18:43:00 +01:00
|
|
|
/// Processes a single file, either converting it to PDF or copying it.
|
|
|
|
fn process_file(&self, path: &Path, current_source: &Path, current_target: &Path) -> Result<()> {
|
|
|
|
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
|
|
|
|
let relative_path = path
|
|
|
|
.strip_prefix(current_source)
|
|
|
|
.map_err(ProcessError::StripPrefix)?;
|
|
|
|
|
|
|
|
let ext = ext.to_lowercase();
|
|
|
|
if FILES_TO_CONVERT.contains(&ext.as_str()) {
|
|
|
|
self.convert_file(path, relative_path, current_target)?;
|
|
|
|
} else if FILES_TO_COPY.contains(&ext.as_str()) {
|
|
|
|
self.copy_file(path, relative_path, current_target)?;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Ok(())
|
|
|
|
}
|
2025-05-23 20:03:30 +01:00
|
|
|
|
2025-05-26 18:43:00 +01:00
|
|
|
/// Converts a file to PDF format.
|
|
|
|
fn convert_file(&self, path: &Path, relative_path: &Path, current_target: &Path) -> Result<()> {
|
|
|
|
let pdf_path = current_target.join(relative_path.with_extension("pdf"));
|
|
|
|
|
|
|
|
if Self::needs_copy_or_conversion(path, &pdf_path) {
|
|
|
|
tools::convert_file(path, pdf_path.parent().unwrap_or(current_target))
|
|
|
|
.map_err(|e| ProcessError::Processing(e))?;
|
|
|
|
|
|
|
|
info!(
|
|
|
|
"Converted: {} -> {}",
|
|
|
|
path.strip_prefix(&self.source_dir).unwrap().display(),
|
|
|
|
pdf_path.strip_prefix(&self.target_dir).unwrap().display()
|
|
|
|
);
|
|
|
|
}
|
|
|
|
Ok(())
|
|
|
|
}
|
2025-05-23 19:22:44 +01:00
|
|
|
|
2025-05-26 18:43:00 +01:00
|
|
|
/// Copies a file to the target directory.
|
|
|
|
fn copy_file(&self, path: &Path, relative_path: &Path, current_target: &Path) -> Result<()> {
|
|
|
|
let dest_path = current_target.join(relative_path);
|
|
|
|
if Self::needs_copy_or_conversion(path, &dest_path) {
|
|
|
|
fs::copy(path, &dest_path).map_err(ProcessError::Io)?;
|
|
|
|
info!(
|
|
|
|
"Copied file: {} -> {}",
|
|
|
|
path.strip_prefix(&self.source_dir).unwrap().display(),
|
|
|
|
dest_path.strip_prefix(&self.target_dir).unwrap().display()
|
|
|
|
);
|
|
|
|
}
|
|
|
|
Ok(())
|
|
|
|
}
|
2025-05-23 20:03:30 +01:00
|
|
|
|
2025-05-26 18:43:00 +01:00
|
|
|
/// Processes all files in the source directory, converting or copying them as needed,
|
|
|
|
/// and then cleans up the target directory.
|
|
|
|
pub fn process(&mut self) -> Result<()> {
|
|
|
|
debug!("Collecting source files");
|
|
|
|
self.get_source_files(&self.source_dir.to_owned())?;
|
|
|
|
|
|
|
|
debug!("Starting directory processing");
|
|
|
|
self.process_directory(&self.source_dir.to_owned(), &self.target_dir.to_owned())?;
|
|
|
|
|
|
|
|
debug!("Cleaning target directory");
|
|
|
|
self.clean_target_directory(&self.target_dir.to_owned())?;
|
|
|
|
|
|
|
|
info!("Directory processing completed successfully");
|
2025-05-23 19:22:44 +01:00
|
|
|
Ok(())
|
|
|
|
}
|
2025-05-23 20:03:30 +01:00
|
|
|
}
|