第一步:添加总数分类

第二步:为“生成物理地址文件”添加重复文件名检测
This commit is contained in:
CXT-maker 2025-11-17 20:29:11 +08:00
parent a447e791e0
commit df39df55d7
8 changed files with 297 additions and 83 deletions

View File

@ -72,7 +72,7 @@ dependencies {
implementation 'com.github.albfernandez:javadbf:1.14.1'
implementation 'org.apache.poi:poi-ooxml:5.4.1'
implementation 'com.intellij:annotations:12.0'
implementation 'org.ofdrw:ofdrw-reader:2.3.7'
compileOnly 'org.projectlombok:lombok:1.18.38'
annotationProcessor 'org.projectlombok:lombok:1.18.38'

View File

@ -12,7 +12,7 @@ public class JavaFxApplication extends Application {
@Override
public void init() throws Exception {
super.init();
System.setVersion("1.0.0.5");
System.setVersion("1.0");
}
@Override

View File

@ -31,6 +31,17 @@ public class DuplicateFinder {
private final FileHashCalculator hashCalculator;
private final boolean enableProgress;
private final ExecutorService executorService;
private final AtomicInteger targetFiles = new AtomicInteger(0);
private final AtomicInteger otherFiles = new AtomicInteger(0);
/**
 * Returns the current value of the {@code targetFiles} counter.
 *
 * @return the number read from the {@code targetFiles} atomic counter at call time
 */
public int getTargetFilesCount() {
return targetFiles.get();
}
/**
 * Returns the current value of the {@code otherFiles} counter.
 *
 * @return the number read from the {@code otherFiles} atomic counter at call time
 */
public int getOtherFilesCount() {
return otherFiles.get();
}
// 进度回调接口
public interface ProgressCallback {
default void onPhaseStarted(Phase phase) {}
@ -71,6 +82,8 @@ public class DuplicateFinder {
public List<DuplicateGroup> findDuplicates(Path rootDir) throws IOException {
// 清理错误列表
errors.clear();
targetFiles.set(0);
otherFiles.set(0);
// -----------------------------
// 第一阶段按文件大小分组
// -----------------------------
@ -154,6 +167,25 @@ public class DuplicateFinder {
meta.setPath(file);
meta.setSize(Files.size(file));
sizeGroups.computeIfAbsent(meta.getSize(), k -> new ArrayList<>()).add(meta);
// Classify the file by its extension and bump the matching counter.
// Locale.ROOT keeps the lower-casing locale-independent: with a Turkish default locale,
// "GIF".toLowerCase() yields "gıf" (dotless i), which would never equal "gif" / "tiff"
// and the file would be miscounted as a non-target file.
String fileName = file.getFileName().toString().toLowerCase(java.util.Locale.ROOT);
String extension = "";
int lastDotIndex = fileName.lastIndexOf('.');
// A dot at index 0 (e.g. ".gitignore") is treated as part of the name, not as an extension separator.
if (lastDotIndex > 0) {
    extension = fileName.substring(lastDotIndex + 1);
}
// Target files are documents (pdf, ofd) and images (jpg, jpeg, png, bmp, gif, tiff, jp2).
if ("pdf".equals(extension) || "ofd".equals(extension) ||
        "jpg".equals(extension) || "jpeg".equals(extension) ||
        "png".equals(extension) || "bmp".equals(extension) ||
        "gif".equals(extension) || "tiff".equals(extension) ||
        "jp2".equals(extension)) {
    targetFiles.incrementAndGet();
} else {
    otherFiles.incrementAndGet();
}
} catch (IOException e) {
log.error(LoggerMarker.TRACE_MARKER, "Failed to get file's size: {}", file);
}

View File

@ -98,6 +98,7 @@ public class LogicalAddressFileGenerator implements AddressFileGenerator {
errorMsg.append(code).append("\n");
}
log.info(LoggerMarker.RELEASE_MARKER, "{}", errorMsg.toString());
throw new RuntimeException(errorMsg.toString());
}else {
// 写入CSV头部
@ -132,6 +133,26 @@ public class LogicalAddressFileGenerator implements AddressFileGenerator {
* 生成文件级逻辑地址文件
*/
private void generateFileLevelFile(PrintWriter writer, List<Record> records, ProgressCallback callback) {
Set<String> seenCodes = new HashSet<>();
Set<String> duplicateCodes = new HashSet<>();
// 收集所有重复的档号
for (Record record : records) {
String archiveCode = record.archiveCode;
if (!seenCodes.add(archiveCode)) {
duplicateCodes.add(archiveCode);
}
}
// 如果存在重复档号记录日志并抛出异常
if (!duplicateCodes.isEmpty()) {
StringBuilder errorMsg = new StringBuilder("存在重复档号:\n");
for (String code : duplicateCodes) {
errorMsg.append(code).append("\n");
}
log.info(LoggerMarker.RELEASE_MARKER, "{}", errorMsg.toString());
throw new RuntimeException(errorMsg.toString());
} else {
// 写入CSV头部包含页数列
writer.println("逻辑文件名,逻辑地址,页数");
int totalRecords = records.stream().mapToInt(r -> r.page).sum();
@ -150,6 +171,7 @@ public class LogicalAddressFileGenerator implements AddressFileGenerator {
safeOnPhaseProgress(Phase.GENERATE_LOGICAL, current, totalRecords);
}
}
}
/**
* 根据档号生成文件级逻辑地址不包含页数

View File

@ -3,11 +3,14 @@ package top.r3944realms.docchecktoolrefactored.core;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.ofdrw.reader.OFDReader;
import top.r3944realms.docchecktoolrefactored.util.LoggerMarker;
import java.io.File;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Set;
@Slf4j
public class PhysicalAddressFileGenerator implements AddressFileGenerator {
@ -55,10 +58,8 @@ public class PhysicalAddressFileGenerator implements AddressFileGenerator {
try (PrintWriter writer = new PrintWriter(outputFile, StandardCharsets.UTF_8)) {
if (folderType == PAGE_TYPE) {
writer.println("物理文件名,物理地址");
processPageLevelFolder(rootFolder, writer, outputFile.getAbsolutePath(), callback, counter, totalFiles);
} else if (folderType == FILE_TYPE) {
writer.println("物理文件名,物理地址,页数");
processFileLevelFolder(rootFolder, writer, outputFile.getAbsolutePath(), callback, counter, totalFiles);
} else {
throw new IllegalArgumentException("不支持的文件夹类型: " + folderType);
@ -66,7 +67,17 @@ public class PhysicalAddressFileGenerator implements AddressFileGenerator {
}
safeOnPhaseCompleted(Phase.GENERATE_PHYSICAL);
} catch (Exception e) {
} catch (RuntimeException e) {
// 直接重新抛出自定义的重复文件名异常不让它被日志吞掉
if (e.getMessage() != null && e.getMessage().startsWith("存在重复文件名:")) {
throw e;
}
safeOnPhaseCompleted(Phase.GENERATE_PHYSICAL);
safeOnPhaseProgress(Phase.GENERATE_PHYSICAL, 0, 0);
log.error("生成物理地址文件失败: {}", e.getMessage(), e);
throw e; // 重新抛出异常确保能被上层捕获
}
catch (Exception e) {
safeOnPhaseCompleted(Phase.GENERATE_PHYSICAL);
safeOnPhaseProgress(Phase.GENERATE_PHYSICAL, 0, 0);
log.error("生成物理地址文件失败: {}", e.getMessage(), e);
@ -90,6 +101,7 @@ public class PhysicalAddressFileGenerator implements AddressFileGenerator {
}
return count;
}
/**
* 处理页面级文件夹及其内部文件
*
@ -98,6 +110,61 @@ public class PhysicalAddressFileGenerator implements AddressFileGenerator {
* @param outputFilePath 输出文件的绝对路径
*/
private void processPageLevelFolder(File folder, PrintWriter writer, String outputFilePath, ProgressCallback callback, int[] counter, int total) {
    // First pass: walk the tree once and record every extension-less image name seen more than once.
    final Set<String> namesSeen = new HashSet<>();
    final Set<String> namesRepeated = new HashSet<>();
    collectPageLevelFileNames(folder, outputFilePath, namesSeen, namesRepeated);

    // Abort before writing anything when duplicates exist; the message prefix is matched by callers.
    if (!namesRepeated.isEmpty()) {
        final StringBuilder report = new StringBuilder("存在重复文件名:\n");
        namesRepeated.forEach(repeated -> report.append(repeated).append("\n"));
        final String message = report.toString();
        log.info(LoggerMarker.RELEASE_MARKER, "{}", message);
        throw new RuntimeException(message);
    }

    // Second pass: emit the CSV header, then the rows for every image file.
    writer.println("物理文件名,物理地址");
    processPageLevelFolderInternal(folder, writer, outputFilePath, callback, counter, total);
}
/**
 * Recursively walks {@code folder} and records the extension-less name of every image file
 * so that names occurring more than once can be reported.
 *
 * @param folder             directory to scan; hidden entries are skipped
 * @param outputFilePath     absolute path of the output file, which is excluded from the scan
 * @param seenFileNames      accumulator of every name encountered so far
 * @param duplicateFileNames accumulator of names encountered at least twice
 */
private void collectPageLevelFileNames(File folder, String outputFilePath,
        Set<String> seenFileNames, Set<String> duplicateFileNames) {
    final File[] entries = folder.listFiles(candidate -> !candidate.isHidden());
    if (entries == null) {
        // listFiles returned nothing to iterate over.
        return;
    }
    for (final File entry : entries) {
        // Never count the output file itself.
        if (entry.getAbsolutePath().equals(outputFilePath)) {
            continue;
        }
        final String entryName = entry.getName();
        if (entry.isFile() && isImageFile(entryName)) {
            // Compare names without their extension.
            final String baseName = entryName.substring(0, entryName.lastIndexOf('.'));
            final boolean firstOccurrence = seenFileNames.add(baseName);
            if (!firstOccurrence) {
                duplicateFileNames.add(baseName);
            }
            continue;
        }
        if (entry.isDirectory()) {
            // Descend into sub-folders with the same accumulators.
            collectPageLevelFileNames(entry, outputFilePath, seenFileNames, duplicateFileNames);
        }
    }
}
/**
* 实际处理页面级文件夹的内部实现无重复检查
*/
private void processPageLevelFolderInternal(File folder, PrintWriter writer, String outputFilePath,
ProgressCallback callback, int[] counter, int total) {
// 获取该文件夹下的所有非隐藏文件和文件夹
File[] filesAndFolders = folder.listFiles(file -> !file.isHidden());
@ -108,7 +175,6 @@ public class PhysicalAddressFileGenerator implements AddressFileGenerator {
// 只处理图片文件跳过其他类型的文件
if (file.isFile() && isImageFile(file.getName())) {
// 移除文件扩展名
String fileNameWithoutExt = file.getName().substring(0, file.getName().lastIndexOf('.'));
@ -125,12 +191,22 @@ public class PhysicalAddressFileGenerator implements AddressFileGenerator {
safeOnPhaseProgress(Phase.GENERATE_PHYSICAL, counter[0], total);
} else if (file.isDirectory()) {
// 递归处理子文件夹
processPageLevelFolder(file, writer, outputFilePath, callback, counter, total);
processPageLevelFolderInternal(file, writer, outputFilePath, callback, counter, total);
}
}
}
}
/**
* 处理文件级文件夹处理PDF文件
*
@ -139,6 +215,60 @@ public class PhysicalAddressFileGenerator implements AddressFileGenerator {
* @param outputFilePath 输出文件的绝对路径
*/
private void processFileLevelFolder(File folder, PrintWriter writer, String outputFilePath, ProgressCallback callback, int[] counter, int total) {
    // First pass: walk the tree once and record every extension-less document name seen more than once.
    final Set<String> namesSeen = new HashSet<>();
    final Set<String> namesRepeated = new HashSet<>();
    collectFileLevelFileNames(folder, outputFilePath, namesSeen, namesRepeated);

    // Abort before writing anything when duplicates exist; the message prefix is matched by callers.
    if (!namesRepeated.isEmpty()) {
        final StringBuilder report = new StringBuilder("存在重复文件名:\n");
        namesRepeated.forEach(repeated -> report.append(repeated).append("\n"));
        final String message = report.toString();
        log.info(LoggerMarker.RELEASE_MARKER, "{}", message);
        throw new RuntimeException(message);
    }

    // Second pass: emit the CSV header (including the page-count column), then the rows.
    writer.println("物理文件名,物理地址,页数");
    processFileLevelFolderInternal(folder, writer, outputFilePath, callback, counter, total);
}
/**
 * Recursively walks {@code folder} and records the extension-less name of every file accepted
 * by {@code isPdfFile} so that names occurring more than once can be reported.
 *
 * @param folder             directory to scan; hidden entries are skipped
 * @param outputFilePath     absolute path of the output file, which is excluded from the scan
 * @param seenFileNames      accumulator of every name encountered so far
 * @param duplicateFileNames accumulator of names encountered at least twice
 */
private void collectFileLevelFileNames(File folder, String outputFilePath,
        Set<String> seenFileNames, Set<String> duplicateFileNames) {
    final File[] entries = folder.listFiles(candidate -> !candidate.isHidden());
    if (entries == null) {
        // listFiles returned nothing to iterate over.
        return;
    }
    for (final File entry : entries) {
        // Never count the output file itself.
        if (entry.getAbsolutePath().equals(outputFilePath)) {
            continue;
        }
        final String entryName = entry.getName();
        if (entry.isFile() && isPdfFile(entryName)) {
            // Compare names without their extension.
            final String baseName = entryName.substring(0, entryName.lastIndexOf('.'));
            final boolean firstOccurrence = seenFileNames.add(baseName);
            if (!firstOccurrence) {
                duplicateFileNames.add(baseName);
            }
            continue;
        }
        if (entry.isDirectory()) {
            // Descend into sub-folders with the same accumulators.
            collectFileLevelFileNames(entry, outputFilePath, seenFileNames, duplicateFileNames);
        }
    }
}
/**
* 实际处理文件级文件夹的内部实现无重复检查
*/
private void processFileLevelFolderInternal(File folder, PrintWriter writer, String outputFilePath,
ProgressCallback callback, int[] counter, int total) {
// 获取该文件夹下的所有非隐藏文件和文件夹
File[] filesAndFolders = folder.listFiles(file -> !file.isHidden());
@ -166,31 +296,41 @@ public class PhysicalAddressFileGenerator implements AddressFileGenerator {
safeOnPhaseProgress(Phase.GENERATE_PHYSICAL, counter[0], total);
} else if (file.isDirectory()) {
// 递归处理子文件夹
processFileLevelFolder(file, writer, outputFilePath, callback, counter, total);
processFileLevelFolderInternal(file, writer, outputFilePath, callback, counter, total);
}
}
}
}
/**
* 获取PDF文件的页数
* 获取PDF或OFD文件的页数
*
* @param pdfFile PDF文件
* @param file 文件
* @return 页数
*/
private int getPdfPageCount(File pdfFile) {
private int getPdfPageCount(File file) {
try {
if (file.getName().toLowerCase().endsWith(".pdf")) {
// 使用Apache PDFBox库获取PDF页数
try (PDDocument document = Loader.loadPDF(pdfFile)){
try (PDDocument document = Loader.loadPDF(file)) {
int pageCount = document.getNumberOfPages();
document.close();
return pageCount;
}
} else if (file.getName().toLowerCase().endsWith(".ofd")) {
// 使用OFDRW库获取OFD页数
try (org.ofdrw.reader.OFDReader reader = new org.ofdrw.reader.OFDReader(file.toPath())) {
// 获取OFD文档的页面数量
int pageCount = reader.getNumberOfPages();
return pageCount;
}
}
} catch (Exception e) {
log.warn(LoggerMarker.RELEASE_MARKER, "无法获取PDF文件页数: {}", pdfFile.getAbsolutePath(), e);
log.warn(LoggerMarker.RELEASE_MARKER, "无法获取文件页数: {}", file.getAbsolutePath(), e);
return 0;
}
return 0;
}
/**
* 判断是否为PDF文件
*
@ -198,7 +338,7 @@ public class PhysicalAddressFileGenerator implements AddressFileGenerator {
* @return 是否为PDF文件
*/
private boolean isPdfFile(String fileName) {
return fileName.toLowerCase().endsWith(".pdf");
return fileName.toLowerCase().endsWith(".pdf")|| fileName.toLowerCase().endsWith(".ofd");
}
/**
@ -211,7 +351,8 @@ public class PhysicalAddressFileGenerator implements AddressFileGenerator {
String lowerFileName = fileName.toLowerCase();
return lowerFileName.endsWith(".jpg") || lowerFileName.endsWith(".jpeg") ||
lowerFileName.endsWith(".png") || lowerFileName.endsWith(".bmp") ||
lowerFileName.endsWith(".gif") || lowerFileName.endsWith(".tiff");
lowerFileName.endsWith(".gif") || lowerFileName.endsWith(".tiff")||
lowerFileName.endsWith(".jp2"); // 支持 JPEG 2000 格式
}
/**

View File

@ -179,7 +179,7 @@ public class PathCheckPaneController implements Initializable {
// 当任务完成时显示完整结果
task.setOnSucceeded(e -> {
progressBar.closeProgress();
result2TA.setText(task.getValue());
result2TA.setText("生成逻辑路径 csv 文件任务完成输出csv文件路径"+task.getValue());
generateLogicalAddress2B.setDisable(false);
addResultSuccessfulStyle();
log.info(LoggerMarker.RELEASE_MARKER, "生成逻辑路径 csv 文件任务完成输出csv文件路径{}", task.getValue());
@ -296,7 +296,7 @@ public class PathCheckPaneController implements Initializable {
// 当任务完成时显示完整结果
task.setOnSucceeded(e -> {
progressBar.closeProgress();
result2TA.setText(task.getValue());
result2TA.setText("生成物理路径 csv 文件任务完成输出csv文件路径"+task.getValue());
generatePhysicalAddress2B.setDisable(false);
log.info(LoggerMarker.RELEASE_MARKER, "生成物理路径 csv 文件任务完成输出csv文件路径{}", task.getValue());
addResultSuccessfulStyle();

View File

@ -75,6 +75,13 @@ public class AddressFileGenerationTask extends Task<String> {
// Duplicate-archive-code and duplicate-file-name failures are handled the same way:
// discard the CSV and propagate the exception untouched.
if (e instanceof RuntimeException
        && e.getMessage() != null
        && (e.getMessage().startsWith("存在重复档号:")
            || e.getMessage().startsWith("存在重复文件名:"))) {
    outputFile.delete(); // remove the CSV file that was created without content
    throw e; // rethrow as-is so the original message is preserved
}
throw new RuntimeException("地址文件生成失败", e);

View File

@ -25,6 +25,7 @@ public class DuplicateDocumentDetectionTask extends Task<String>{
private final String folderPath;
private final DuplicateFinder duplicateFinder;
public DuplicateDocumentDetectionTask(String folderPath) {
this.folderPath = folderPath;
// 创建带进度更新的扫描器
@ -48,6 +49,7 @@ public class DuplicateDocumentDetectionTask extends Task<String>{
// 用于统计文件总数
AtomicInteger totalFiles = new AtomicInteger(0);
// 使用 RobustParallelScanner MD5HashCalculator 进行并行扫描和哈希计算
// 设置进度回调
duplicateFinder.setProgressCallback(new DuplicateFinder.ProgressCallback() {
@ -127,14 +129,19 @@ public class DuplicateDocumentDetectionTask extends Task<String>{
List<DuplicateGroup> duplicateGroups = resultRef.get();
// 构建最终结果
return generateResult(duplicateGroups, totalFiles);
return generateResult(duplicateGroups, totalFiles,
duplicateFinder.getTargetFilesCount(),
duplicateFinder.getOtherFilesCount());
}
private static @NotNull String generateResult(List<DuplicateGroup> duplicateGroups, AtomicInteger totalFiles) {
// Update method signature
private static @NotNull String generateResult(List<DuplicateGroup> duplicateGroups,
AtomicInteger totalFiles,
int targetFilesCount,
int otherFilesCount) {
StringBuilder result = new StringBuilder();
// 计算总文件数所有组中的文件数
// Calculate total duplicate files
int totalDuplicateFiles = duplicateGroups.stream()
.mapToInt(group -> group.fileMetas().size())
.sum();
@ -142,9 +149,12 @@ public class DuplicateDocumentDetectionTask extends Task<String>{
int totalGroups = duplicateGroups.size();
// Summary header: overall scan count, target / non-target split, then duplicate statistics.
result.append(String.format("总共扫描文件数: %d\n", totalFiles.get()));
// The label lists the extensions counted as target files; "bmp" was previously misspelled as "bmg".
result.append(String.format("其中目标文件(jpg jpeg png bmp gif tiff jp2 pdf ofd )数: %d\t", targetFilesCount));
result.append(String.format(" 非目标文件数: %d\n", otherFilesCount));
result.append(String.format("发现重复文件组数: %d\n", totalGroups));
result.append(String.format("重复文件总数: %d\n", totalDuplicateFiles));
// Rest of the existing implementation remains the same
if (!duplicateGroups.isEmpty()) {
result.append("\n详细重复文件信息:\n");
result.append("----------------------------------------\n");
@ -171,6 +181,7 @@ public class DuplicateDocumentDetectionTask extends Task<String>{
return result.toString();
}
@Override
protected void cancelled() {
super.cancelled();
@ -178,4 +189,5 @@ public class DuplicateDocumentDetectionTask extends Task<String>{
duplicateFinder.shutdown();
updateMessage("操作已被取消");
}
}