2025-05-26 18:43:00 +01:00
|
|
|
use crate::error::{ProcessError, Result};
|
2025-05-27 00:58:59 +01:00
|
|
|
use crate::file_type::FileType;
|
|
|
|
// use crate::processed_path::ProcessedPath;
|
|
|
|
use crate::FILES_TO_CONVERT;
|
2025-05-23 20:03:30 +01:00
|
|
|
use crate::tools;
|
2025-05-26 18:43:00 +01:00
|
|
|
use log::{debug, info, warn};
|
2025-05-27 00:58:59 +01:00
|
|
|
use std::{
|
|
|
|
collections::HashSet,
|
|
|
|
fs,
|
|
|
|
path::{Path, PathBuf},
|
|
|
|
time::SystemTime,
|
|
|
|
};
|
2025-05-23 19:22:44 +01:00
|
|
|
|
2025-05-26 18:43:00 +01:00
|
|
|
/// DirectoryProcessor handles the conversion of documents from a source directory
/// to a target directory, managing file conversions, copies, and cleanup.
#[derive(Debug)]
pub struct DirectoryProcessor {
    /// Root of the tree that is scanned for input files.
    pub source_dir: PathBuf,
    /// Root of the tree that receives converted and copied files.
    pub target_dir: PathBuf,
    /// Paths, relative to `source_dir`, of every source file selected for processing.
    pub source_files: HashSet<PathBuf>,
}
|
|
|
|
|
|
|
|
impl DirectoryProcessor {
|
2025-05-26 18:43:00 +01:00
|
|
|
/// Creates a new DirectoryProcessor instance.
|
|
|
|
///
|
|
|
|
/// # Arguments
|
|
|
|
/// * `source` - The source directory containing files to process
|
|
|
|
/// * `target` - The target directory where processed files will be placed
|
2025-05-23 19:22:44 +01:00
|
|
|
pub fn new(source: PathBuf, target: PathBuf) -> Self {
|
|
|
|
Self {
|
|
|
|
source_dir: source,
|
|
|
|
target_dir: target,
|
|
|
|
source_files: HashSet::new(),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2025-05-27 00:58:59 +01:00
|
|
|
/// Determines if a file needs to be processed based on modification times.
///
/// Returns `true` when:
/// * the metadata or modification time of `target` cannot be read (this
///   includes the common case of `target` not existing yet),
/// * the metadata or modification time of `source` cannot be read, or
/// * `source` was modified strictly later than `target`.
///
/// Returns `false` only when both timestamps are readable and `target` is at
/// least as new as `source`.
pub fn needs_processing(source: &Path, target: &Path) -> bool {
    // An unreadable target timestamp must not be mistaken for "up to date":
    // rebuild instead of silently skipping the file.
    let target_time: SystemTime = match fs::metadata(target).and_then(|m| m.modified()) {
        Ok(time) => time,
        Err(_) => return true,
    };

    // An unreadable source timestamp is also treated as "needs processing" so
    // the subsequent operation surfaces the underlying problem to the caller.
    let source_time: SystemTime = match fs::metadata(source).and_then(|m| m.modified()) {
        Ok(time) => time,
        Err(_) => return true,
    };

    source_time > target_time
}
|
|
|
|
|
2025-05-26 18:43:00 +01:00
|
|
|
/// Collects all source files that need processing.
|
2025-05-27 00:58:59 +01:00
|
|
|
fn collect_source_files(&mut self, dir: &Path) -> Result<()> {
|
|
|
|
for entry in fs::read_dir(dir).map_err(ProcessError::Io)?.flatten() {
|
|
|
|
let path = entry.path();
|
|
|
|
|
|
|
|
if path.is_dir() {
|
|
|
|
self.collect_source_files(&path)?;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
let file_type = FileType::from_path(&path, &self.source_dir, &self.target_dir)?;
|
|
|
|
if file_type.should_process() {
|
|
|
|
if let Some(source) = file_type.source() {
|
|
|
|
if let Ok(relative) = source.strip_prefix(&self.source_dir) {
|
|
|
|
self.source_files.insert(relative.to_path_buf());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Ok(())
|
|
|
|
}
|
2025-05-23 19:22:44 +01:00
|
|
|
|
2025-05-27 00:58:59 +01:00
|
|
|
/// Processes all files in the source directory.
|
|
|
|
fn process_directory(&self, dir: &Path) -> Result<()> {
|
|
|
|
for entry in fs::read_dir(dir).map_err(ProcessError::Io)?.flatten() {
|
2025-05-23 19:22:44 +01:00
|
|
|
let path = entry.path();
|
2025-05-27 00:58:59 +01:00
|
|
|
|
2025-05-23 19:22:44 +01:00
|
|
|
if path.is_dir() {
|
2025-05-27 00:58:59 +01:00
|
|
|
fs::create_dir_all(self.target_dir.join(path.strip_prefix(&self.source_dir)?))
|
|
|
|
.map_err(ProcessError::Io)?;
|
|
|
|
self.process_directory(&path)?;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
let file_type = FileType::from_path(&path, &self.source_dir, &self.target_dir)?;
|
|
|
|
self.process_file(file_type)?;
|
|
|
|
}
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Processes a single file based on its type.
|
|
|
|
fn process_file(&self, file_type: FileType) -> Result<()> {
|
|
|
|
match file_type {
|
|
|
|
FileType::Convert { source, target } => {
|
|
|
|
if Self::needs_processing(&source, &target) {
|
|
|
|
if let Some(parent) = target.parent() {
|
|
|
|
fs::create_dir_all(parent).map_err(ProcessError::Io)?;
|
2025-05-23 19:22:44 +01:00
|
|
|
}
|
2025-05-27 00:58:59 +01:00
|
|
|
tools::convert_file(&source, target.parent().unwrap_or(&self.target_dir))
|
|
|
|
.map_err(ProcessError::Processing)?;
|
|
|
|
|
|
|
|
info!(
|
|
|
|
"Converted: {} -> {}",
|
|
|
|
source.strip_prefix(&self.source_dir)?.display(),
|
|
|
|
target.strip_prefix(&self.target_dir)?.display()
|
|
|
|
);
|
2025-05-23 19:22:44 +01:00
|
|
|
}
|
|
|
|
}
|
2025-05-27 00:58:59 +01:00
|
|
|
FileType::Copy { source, target } => {
|
|
|
|
if Self::needs_processing(&source, &target) {
|
|
|
|
if let Some(parent) = target.parent() {
|
|
|
|
fs::create_dir_all(parent).map_err(ProcessError::Io)?;
|
|
|
|
}
|
|
|
|
fs::copy(&source, &target).map_err(ProcessError::Io)?;
|
|
|
|
|
|
|
|
info!(
|
|
|
|
"Copied: {} -> {}",
|
|
|
|
source.strip_prefix(&self.source_dir)?.display(),
|
|
|
|
target.strip_prefix(&self.target_dir)?.display()
|
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
_ => {}
|
2025-05-23 19:22:44 +01:00
|
|
|
}
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
2025-05-27 00:58:59 +01:00
|
|
|
/// Cleans up the target directory by removing obsolete files.
|
|
|
|
fn clean_target_directory(&self, dir: &Path) -> Result<bool> {
|
2025-05-23 19:22:44 +01:00
|
|
|
let mut is_empty = true;
|
|
|
|
|
2025-05-27 00:58:59 +01:00
|
|
|
for entry in fs::read_dir(dir).map_err(ProcessError::Io)?.flatten() {
|
2025-05-23 19:22:44 +01:00
|
|
|
let path = entry.path();
|
2025-05-27 00:58:59 +01:00
|
|
|
let file_type = FileType::from_path(&path, &self.target_dir, &self.target_dir)?;
|
|
|
|
|
|
|
|
if file_type.should_ignore() {
|
|
|
|
is_empty = false;
|
|
|
|
continue;
|
2025-05-25 00:18:11 +01:00
|
|
|
}
|
|
|
|
|
2025-05-23 19:22:44 +01:00
|
|
|
if path.is_dir() {
|
2025-05-26 18:43:00 +01:00
|
|
|
match self.clean_target_directory(&path) {
|
|
|
|
Ok(subdir_empty) => {
|
|
|
|
if subdir_empty {
|
|
|
|
if let Err(e) = fs::remove_dir(&path) {
|
|
|
|
warn!("Could not remove empty directory {}: {}", path.display(), e);
|
|
|
|
} else {
|
|
|
|
debug!("Removed empty directory: {}", path.display());
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
is_empty = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Err(e) => {
|
|
|
|
warn!("Error cleaning directory {}: {}", path.display(), e);
|
|
|
|
is_empty = false;
|
2025-05-23 19:22:44 +01:00
|
|
|
}
|
|
|
|
}
|
2025-05-27 00:58:59 +01:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
let relative = path.strip_prefix(&self.target_dir)?;
|
|
|
|
if !self.should_file_exist(relative) {
|
|
|
|
if let Err(e) = fs::remove_file(&path) {
|
|
|
|
warn!("Could not remove file {}: {}", relative.display(), e);
|
2025-05-23 19:22:44 +01:00
|
|
|
} else {
|
2025-05-27 00:58:59 +01:00
|
|
|
debug!("Removed obsolete file: {}", relative.display());
|
2025-05-23 19:22:44 +01:00
|
|
|
}
|
2025-05-27 00:58:59 +01:00
|
|
|
} else {
|
|
|
|
is_empty = false;
|
2025-05-23 19:22:44 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(is_empty)
|
|
|
|
}
|
|
|
|
|
2025-05-27 00:58:59 +01:00
|
|
|
/// Determines if a file in the target directory should exist.
|
2025-05-26 18:43:00 +01:00
|
|
|
fn should_file_exist(&self, rel_path: &Path) -> bool {
|
|
|
|
if let Some(ext) = rel_path.extension().and_then(|e| e.to_str()) {
|
|
|
|
if ext == "pdf" {
|
2025-05-27 00:58:59 +01:00
|
|
|
// Check if any corresponding source file exists
|
2025-05-26 18:43:00 +01:00
|
|
|
FILES_TO_CONVERT
|
|
|
|
.iter()
|
|
|
|
.any(|&ext| self.source_files.contains(&rel_path.with_extension(ext)))
|
|
|
|
} else {
|
|
|
|
self.source_files.contains(rel_path)
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
false
|
2025-05-23 19:22:44 +01:00
|
|
|
}
|
2025-05-26 18:43:00 +01:00
|
|
|
}
|
|
|
|
|
2025-05-27 00:58:59 +01:00
|
|
|
/// Process the entire directory structure.
|
|
|
|
pub fn process(&mut self) -> Result<()> {
|
|
|
|
// Clone paths before mutable borrow to avoid borrowing conflicts
|
|
|
|
let source_dir = self.source_dir.to_owned();
|
|
|
|
let target_dir = self.target_dir.to_owned();
|
2025-05-23 19:22:44 +01:00
|
|
|
|
2025-05-27 00:58:59 +01:00
|
|
|
// Now we can use the mutable borrow for collect_source_files
|
|
|
|
self.collect_source_files(&source_dir)?;
|
2025-05-23 20:03:30 +01:00
|
|
|
|
2025-05-27 00:58:59 +01:00
|
|
|
// And use the cloned paths for the remaining operations
|
|
|
|
self.process_directory(&source_dir)?;
|
|
|
|
self.clean_target_directory(&target_dir)?;
|
2025-05-23 19:22:44 +01:00
|
|
|
Ok(())
|
|
|
|
}
|
2025-05-23 20:03:30 +01:00
|
|
|
}
|