feat:文件扫描功能、hash码计算与重复文件扫描的类实现

This commit is contained in:
叁玖领域 2025-07-14 15:29:48 +08:00
parent e764df736a
commit 5e2fbfe1f9
15 changed files with 615 additions and 34 deletions

View File

@ -1,3 +1,9 @@
buildscript {
repositories {
google()
mavenCentral()
}
}
plugins {
id 'java'
id 'io.franzbecker.gradle-lombok' version '3.0.0'
@ -56,12 +62,17 @@ dependencies {
testRuntimeOnly("org.junit.jupiter:junit-jupiter-engine:${junitVersion}")
}
test {
useJUnitPlatform()
configurations.configureEach {
exclude group: 'org.apache.logging.log4j', module: 'log4j-slf4j-impl'
}
}
// testJar
tasks.register('testJar', Jar) {
from sourceSets.test.output
}
tasks.register('createLogDir') {
doLast {

View File

@ -8,11 +8,15 @@ module top.r3944realms.docchecktoolrefactored {
opens top.r3944realms.docchecktoolrefactored to javafx.fxml;
opens top.r3944realms.docchecktoolrefactored.ui to javafx.fxml;
opens top.r3944realms.docchecktoolrefactored.ui.module to javafx.fxml;
opens top.r3944realms.docchecktoolrefactored.deprecated to javafx.fxml;
exports top.r3944realms.docchecktoolrefactored to javafx.graphics;
exports top.r3944realms.docchecktoolrefactored.ui to javafx.fxml;
exports top.r3944realms.docchecktoolrefactored.ui.module to javafx.fxml;
exports top.r3944realms.docchecktoolrefactored.deprecated to javafx.graphics;
opens top.r3944realms.docchecktoolrefactored.deprecated to javafx.fxml;
exports top.r3944realms.docchecktoolrefactored.core ;
exports top.r3944realms.docchecktoolrefactored.io.scanner;
exports top.r3944realms.docchecktoolrefactored.io.reader;
exports top.r3944realms.docchecktoolrefactored.model;
}

View File

@ -4,6 +4,7 @@ import javafx.application.Application;
import javafx.stage.Stage;
import lombok.extern.slf4j.Slf4j;
import top.r3944realms.docchecktoolrefactored.ui.SceneManager;
import top.r3944realms.docchecktoolrefactored.util.StringUtil;
/**
* The type Main.
@ -28,7 +29,7 @@ public class Main extends Application {
* @param args the input arguments
*/
public static void main(String[] args) {
log.info("Hello World!");
log.info(StringUtil.NO_BUG);
launch(args);
}
}

View File

@ -0,0 +1,104 @@
package top.r3944realms.docchecktoolrefactored.core;
import lombok.extern.slf4j.Slf4j;
import top.r3944realms.docchecktoolrefactored.io.scanner.FileScanner;
import top.r3944realms.docchecktoolrefactored.model.DuplicateGroup;
import top.r3944realms.docchecktoolrefactored.model.FileMetadata;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;
/**
 * Core class for finding duplicate files.
 * <p>
 * Strategy: files are first bucketed by size (cheap), then only buckets with
 * more than one file are hashed, and finally files sharing a hash are reported
 * as duplicate groups.
 */
@Slf4j
public class DuplicateFinder {
    private final FileScanner fileScanner;
    private final FileHashCalculator hashCalculator;
    // When true, scanWithProgress is used so the scanner reports progress callbacks.
    private final boolean enableProgress;

    public DuplicateFinder(FileScanner fileScanner, FileHashCalculator hashCalculator, boolean enableProgress) {
        this.fileScanner = Objects.requireNonNull(fileScanner);
        this.hashCalculator = Objects.requireNonNull(hashCalculator);
        this.enableProgress = enableProgress;
    }

    public DuplicateFinder(FileScanner fileScanner, FileHashCalculator hashCalculator) {
        this(fileScanner, hashCalculator, false);
    }

    /**
     * Finds duplicate files under the given root directory.
     *
     * @param rootDir root directory to scan
     * @return duplicate file groups keyed by hash, sorted by file size descending
     * @throws IOException if the scan fails
     */
    public List<DuplicateGroup> findDuplicates(Path rootDir) throws IOException {
        // Phase 1: group by file size — files of different sizes can never be duplicates.
        Map<Long, List<FileMetadata>> sizeGroups = groupFilesBySize(rootDir);
        // Phase 2: hash only the candidate files (size groups with more than one entry).
        Map<String, List<FileMetadata>> hashGroups = new ConcurrentHashMap<>();
        sizeGroups.values().parallelStream()
                .filter(group -> group.size() > 1) // only possibly-duplicate files
                .flatMap(Collection::stream)
                .forEach(file -> {
                    try {
                        String hash = hashCalculator.calculateHash(file.getPath());
                        file.setHash(hash);
                        // FIX: the bucket must be thread-safe — several stream threads may
                        // add() to the same list concurrently; a plain ArrayList races here.
                        hashGroups.computeIfAbsent(hash, k -> Collections.synchronizedList(new ArrayList<>()))
                                .add(file);
                    } catch (IOException e) {
                        // Log the failure but keep processing the remaining files.
                        log.error("Failed to calculate file's hash: {}, {}", file.getPath(), e.getMessage());
                    }
                });
        // Phase 3: keep only real duplicates and build the sorted result.
        return hashGroups.values().stream()
                .filter(group -> group.size() > 1)
                .map(group -> new DuplicateGroup(
                        group.get(0).getHash(),
                        group.get(0).getSize(),
                        group
                ))
                .sorted(Comparator.comparingLong(DuplicateGroup::size).reversed())
                .collect(Collectors.toList());
    }

    /**
     * Scans the directory tree and groups every regular file by its size in bytes.
     */
    private Map<Long, List<FileMetadata>> groupFilesBySize(Path rootDir) throws IOException {
        Map<Long, List<FileMetadata>> sizeGroups = new ConcurrentHashMap<>();
        FileScanner.ProgressAwareListener listener = new FileScanner.ProgressAwareListener() {
            @Override
            public void onProgressUpdate(int current, int total) {
                log.info("Scanning progress: {}/{} ", current, total);
            }

            @Override
            public void onFileFound(Path file) {
                try {
                    FileMetadata meta = new FileMetadata();
                    meta.setPath(file);
                    meta.setSize(Files.size(file));
                    // FIX: scanner implementations may invoke onFileFound from multiple
                    // threads — use a synchronized list for the per-size bucket.
                    sizeGroups.computeIfAbsent(meta.getSize(), k -> Collections.synchronizedList(new ArrayList<>()))
                            .add(meta);
                } catch (IOException e) {
                    log.error("Failed to get file's size: {}", file);
                }
            }

            @Override
            public void onScanComplete() {
            }

            @Override
            public void onError(Path file, Exception e) {
                log.error("Error on scanning file: {}, {}", file, e.getMessage());
            }
        };
        if (enableProgress) {
            fileScanner.scanWithProgress(rootDir, listener);
        } else {
            fileScanner.scan(rootDir, listener);
        }
        return sizeGroups;
    }
}

View File

@ -0,0 +1,23 @@
package top.r3944realms.docchecktoolrefactored.core;
import java.io.IOException;
import java.nio.file.Path;
/**
 * Strategy interface for computing file hashes.
 */
public interface FileHashCalculator {
/**
 * Computes the hash of a file's contents.
 *
 * @param file path of the file to hash
 * @return the file's hash encoded as a string
 * @throws IOException if the file cannot be read
 */
String calculateHash(Path file) throws IOException;
/**
 * Returns the default implementation, backed by MD5.
 * <p>
 * NOTE(review): MD5 is adequate for duplicate detection but is not
 * collision-resistant — do not rely on this hash for security purposes.
 */
static FileHashCalculator defaultInstance() {
return new MD5HashCalculator();
}
}

View File

@ -0,0 +1,39 @@
package top.r3944realms.docchecktoolrefactored.core;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
/**
 * {@link FileHashCalculator} implementation backed by the MD5 message digest.
 * Produces a lowercase hexadecimal string.
 */
public class MD5HashCalculator implements FileHashCalculator {
    private static final int BUFFER_SIZE = 8192;
    // Lookup table for lowercase hex encoding.
    private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();

    @Override
    public String calculateHash(Path file) throws IOException {
        final MessageDigest digest;
        try {
            digest = MessageDigest.getInstance("MD5");
        } catch (NoSuchAlgorithmException e) {
            // MD5 is mandated by the JDK spec; reaching this would mean a broken runtime.
            throw new RuntimeException("MD5算法不可用", e);
        }
        try (var in = Files.newInputStream(file)) {
            byte[] chunk = new byte[BUFFER_SIZE];
            for (int n = in.read(chunk); n != -1; n = in.read(chunk)) {
                digest.update(chunk, 0, n);
            }
        }
        return bytesToHex(digest.digest());
    }

    /** Encodes the digest bytes as a lowercase hex string. */
    private static String bytesToHex(byte[] bytes) {
        char[] out = new char[bytes.length * 2];
        for (int i = 0; i < bytes.length; i++) {
            int v = bytes[i] & 0xFF;
            out[i * 2] = HEX_DIGITS[v >>> 4];
            out[i * 2 + 1] = HEX_DIGITS[v & 0x0F];
        }
        return new String(out);
    }
}

View File

@ -12,7 +12,15 @@ public interface FileScanner {
* @param rootPath 根路径
* @param listener 文件发现监听器
*/
void scan(Path rootPath, FileScanListener listener);
default void scan(Path rootPath, FileScanListener listener) {
throw new UnsupportedOperationException("Please implement FileScanner, FileScannerListener.");
}
/**
* 扫描指定路径下的文件带进度反馈
*/
default void scanWithProgress(Path rootPath, ProgressAwareListener listener) {
throw new UnsupportedOperationException("Please implement FileScanner, ProgressAwareListener.");
}
/**
* 文件扫描监听器
@ -38,4 +46,14 @@ public interface FileScanner {
*/
void onError(Path file, Exception e);
}
interface ProgressAwareListener extends FileScanListener {
/**
* 进度更新回调
* @param current 当前已处理文件数
* @param total 预估总文件数可能动态增长
*/
void onProgressUpdate(int current, int total);
}
}

View File

@ -1,18 +1,35 @@
package top.r3944realms.docchecktoolrefactored.io.scanner;
import lombok.extern.slf4j.Slf4j;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* The type Parallel file scanner.
* <p>
* 这个没法正常使用目前遇到的问题
* <p>
* * 目录遍历时遇到权限问题静默失败
* <p>
* * 存在符号链接循环
* <p>
* * 文件系统驱动程序卡死
* <p>
* * JVM与NTFS文件系统兼容性问题
*/
public class ParallelFileScanner implements FileScanner,AutoCloseable {
@Slf4j
@Deprecated
public class ParallelFileScanner implements FileScanner ,AutoCloseable {
private final ForkJoinPool forkJoinPool;
private volatile boolean cancelled = false;
/**
* 使用默认并行度CPU核心数
*/
@ -31,38 +48,90 @@ public class ParallelFileScanner implements FileScanner,AutoCloseable {
@Override
public void scan(Path rootPath, FileScanListener listener) {
forkJoinPool.submit(() -> {
try (
Stream<Path> pathStream = Files.walk(rootPath)
.parallel() // 使用ForkJoinPool的并行流
.filter(Files::isRegularFile)
){
pathStream.forEach(file -> {
try {
listener.onFileFound(file);
} catch (Exception e) {
listener.onError(file, e);
}
});
} catch (IOException e) {
throw new RuntimeException(e);
}
}
).join();
scanInternal(rootPath, listener, null);
}
@Override
public void scanWithProgress(Path rootPath, ProgressAwareListener listener) {
// 先快速统计总文件数
long totalFiles = countFiles(rootPath);
scanInternal(rootPath, listener, totalFiles);
}
listener.onScanComplete();
private long countFiles(Path rootPath) {
try(Stream<Path> pathStream = Files.walk(rootPath)
.parallel()
.filter(Files::isRegularFile)) {
return pathStream.count();
} catch (IOException e) {
return -1; // 表示无法确定总数
}
}
private void scanInternal(Path rootPath, FileScanListener listener, Long totalFiles) {
log.debug("ThreadPool Status: {}", forkJoinPool.isShutdown() ? "Closed" : "Running");
forkJoinPool.submit(() -> { // 方法没问题可能就是在线程这里被卡死了
try {
AtomicInteger processed = new AtomicInteger(0);
log.debug("Scanning files in {}", rootPath);
// 收集所有文件到List避免Stream被重复使用
@SuppressWarnings("resource") List<Path> files = Files.walk(rootPath)
.peek(p -> log.trace("visiting: {}", p))
.parallel()
.filter(p -> {
boolean isRegular = Files.isRegularFile(p);
if (!isRegular) {
log.debug("Skip non-regular : {} ", p);
}
return isRegular;
})
.peek(p -> log.trace("Found file: {}", p))
.toList(); // 立即消费Stream
if (files.isEmpty()) {
log.warn("No files found in directory: {}", rootPath);
} else log.debug("Found {} files in {}", files.size(), rootPath);
files.forEach(file -> {
if (cancelled) {
log.debug("Cancelled scanning file {}", file);
return;
}
try {
log.debug("Handle file {}", file);
listener.onFileFound(file);
// 进度更新
if (listener instanceof ProgressAwareListener progressListener) {
int current = processed.incrementAndGet();
progressListener.onProgressUpdate(
current,
totalFiles != null ? totalFiles.intValue() : -1
);
}
} catch (Exception e) {
log.debug("Error Handle file {}", file, e);
listener.onError(file, e);
}
});
if (!cancelled) {
log.debug("Finished scanning files in {}", rootPath);
listener.onScanComplete();
}
} catch (IOException e) {
listener.onError(rootPath, e);
} catch (Exception e) {
log.error("Unexpected error in scan thread", e);
listener.onError(rootPath, e);
}
});
log.debug("Task submitted to thread pool");
}
public void cancel() {
cancelled = true;
forkJoinPool.shutdownNow();
}
@Override
public void close() {
forkJoinPool.shutdown();
try {
if (!forkJoinPool.awaitTermination(1, TimeUnit.SECONDS)) {
forkJoinPool.shutdownNow();
}
} catch (InterruptedException e) {
forkJoinPool.shutdownNow();
Thread.currentThread().interrupt();
}
cancel();
}
}

View File

@ -0,0 +1,141 @@
package top.r3944realms.docchecktoolrefactored.io.scanner;
import lombok.extern.slf4j.Slf4j;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.AccessDeniedException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
@Slf4j
public class RobustParallelScanner implements FileScanner, AutoCloseable {
    /** Hard limit on how long one scan may run before it is aborted. */
    private static final long SCAN_TIMEOUT_SECONDS = 30;

    private final ForkJoinPool forkJoinPool;
    private volatile boolean cancelled = false;
    // Maximum recursion depth — guards against symlink loops / runaway recursion.
    private final int maxDepth;

    public RobustParallelScanner(int maxDepth) {
        this(Runtime.getRuntime().availableProcessors(), maxDepth);
    }

    public RobustParallelScanner(int parallelism, int maxDepth) {
        this.forkJoinPool = new ForkJoinPool(parallelism);
        this.maxDepth = maxDepth;
    }

    @Override
    public void scan(Path rootPath, FileScanListener listener) {
        scanInternal(rootPath, listener, null);
    }

    @Override
    public void scanWithProgress(Path rootPath, ProgressAwareListener listener) {
        // Pre-scan pass: count total files so progress can be reported.
        AtomicLong totalFiles = new AtomicLong(0);
        countFiles(rootPath, totalFiles);
        scanInternal(rootPath, listener, totalFiles);
    }

    /** Recursively counts regular files under {@code dir}; best effort, errors only logged. */
    private void countFiles(Path dir, AtomicLong counter) {
        if (cancelled) return;
        try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir)) {
            for (Path path : stream) {
                if (cancelled) return;
                if (Files.isDirectory(path)) {
                    countFiles(path, counter);
                } else if (Files.isRegularFile(path)) {
                    counter.incrementAndGet();
                }
            }
        } catch (IOException e) {
            log.warn("Failed to pre-scan: {}", dir, e);
        }
    }

    /**
     * Submits the recursive scan to the pool and blocks (with a timeout) until it finishes.
     *
     * @param totalFiles pre-counted total, or null when progress is not requested
     */
    private void scanInternal(Path rootPath, FileScanListener listener, AtomicLong totalFiles) {
        try {
            validateDirectory(rootPath);
            forkJoinPool.submit(() -> {
                try {
                    AtomicInteger processedFiles = new AtomicInteger(0);
                    scanDirectory(rootPath, listener, processedFiles, totalFiles, 0);
                    if (!cancelled) {
                        listener.onScanComplete();
                    }
                } catch (Exception e) {
                    listener.onError(rootPath, e);
                }
            }).get(SCAN_TIMEOUT_SECONDS, TimeUnit.SECONDS);
        } catch (TimeoutException e) {
            log.error("Scan timeout: {}", rootPath, e);
            // FIX: also set the cancelled flag so the still-running task observes the abort,
            // instead of only tearing the pool down underneath it.
            cancel();
            listener.onError(rootPath, new TimeoutException("扫描超时30秒"));
        } catch (InterruptedException e) {
            // FIX: restore the interrupt status before reporting the error.
            Thread.currentThread().interrupt();
            listener.onError(rootPath, e);
        } catch (Exception e) {
            listener.onError(rootPath, e);
        }
    }

    /** Depth-first directory walk; stops at maxDepth or when cancelled. */
    private void scanDirectory(Path dir, FileScanListener listener,
                               AtomicInteger processedFiles, AtomicLong totalFiles, int currentDepth) {
        if (cancelled || currentDepth > maxDepth) return;
        try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir)) {
            for (Path path : stream) {
                if (cancelled) break;
                if (Files.isDirectory(path)) {
                    scanDirectory(path, listener, processedFiles, totalFiles, currentDepth + 1);
                } else if (Files.isRegularFile(path)) {
                    processFile(path, listener, processedFiles, totalFiles);
                }
            }
        } catch (IOException e) {
            listener.onError(dir, e);
        }
    }

    /** Reports one file to the listener and, when applicable, updates progress. */
    private void processFile(Path file, FileScanListener listener,
                             AtomicInteger processedFiles, AtomicLong totalFiles) {
        if (cancelled) return;
        try {
            listener.onFileFound(file);
            // Progress callbacks only when the caller asked for them (totalFiles != null).
            if (listener instanceof ProgressAwareListener progressListener && totalFiles != null) {
                int processed = processedFiles.incrementAndGet();
                long total = totalFiles.get();
                progressListener.onProgressUpdate(processed, (int) total);
            }
        } catch (Exception e) {
            listener.onError(file, e);
        }
    }

    /** Rejects missing, unreadable, or NTFS-system ("$") directories before scanning. */
    private void validateDirectory(Path path) throws IOException {
        if (!Files.exists(path)) {
            throw new FileNotFoundException(path.toString());
        }
        if (!Files.isReadable(path)) {
            throw new AccessDeniedException(path.toString());
        }
        // Refuse NTFS system directories (e.g. $RECYCLE.BIN) — scanning them hangs/errors.
        if (Files.getFileStore(path).type().equals("NTFS") &&
                path.toString().contains("$")) {
            throw new IOException("系统目录禁止访问: " + path);
        }
    }

    /** Aborts the current scan and stops the worker pool. */
    public void cancel() {
        cancelled = true;
        forkJoinPool.shutdownNow();
    }

    @Override
    public void close() {
        cancel();
        try {
            // FIX: give in-flight tasks a moment to wind down instead of abandoning the pool.
            forkJoinPool.awaitTermination(1, TimeUnit.SECONDS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}

View File

@ -123,7 +123,7 @@ public class SceneManager {
}
/**
* Try get scene handler.
* Try to get scene handler.
*
* @param node the node
* @param handler the handler

View File

@ -0,0 +1,27 @@
package top.r3944realms.docchecktoolrefactored.util;
/**
 * Holder for shared string constants.
 */
public final class StringUtil {
    /**
     * Classic "Buddha bless, no bugs" ASCII-art banner, logged at startup.
     * FIX: was a mutable {@code public static} field — now {@code final}.
     * NOTE(review): banner whitespace reconstructed; original indentation was
     * lost in transit — verify the rendered art against the original file.
     */
    public static final String NO_BUG = """
                               _ooOoo_
                              o8888888o
                              88" . "88
                              (| -_- |)
                              O\\  =  /O
                           ____/`---'\\____
                         .'  \\\\|     |//  `.
                        /  \\\\|||  :  |||//  \\
                       /  _||||| -:- |||||-  \\
                       |   | \\\\\\  -  ///  |   |
                       | \\_|  ''\\---/''  |   |
                       \\  .-\\__  `-`  ___/-. /
                     ___`. .'  /--.--\\  `. . __
                  ."" '<  `.___\\_<|>_/___.'  >'"".
                 | | :  `- \\`.;`\\ _ /`;.`/ - ` : | |
                 \\  \\ `-.   \\_ __\\ /__ _/   .-` /  /
            ======`-.____`-.___\\_____/___.-`____.-'======
                               `=---='
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                       佛祖保佑 永无BUG
            """;

    private StringUtil() {
        // Utility class — not instantiable.
    }
}

View File

@ -0,0 +1,9 @@
// Test companion module: mirrors the main module's dependencies and adds the
// JUnit 5 API for the test sources.
module top.r3944realms.docchecktoolrefactored.test {
requires static lombok; // compile-time only (annotation processing)
requires org.slf4j;
requires top.r3944realms.docchecktoolrefactored; // module under test
requires org.junit.jupiter.api;
exports top.r3944realms.docchecktoolrefactored.test;
opens top.r3944realms.docchecktoolrefactored.test; // reflective access for the JUnit launcher
}

View File

@ -0,0 +1,67 @@
package top.r3944realms.docchecktoolrefactored.test;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.RepeatedTest;
import top.r3944realms.docchecktoolrefactored.core.DuplicateFinder;
import top.r3944realms.docchecktoolrefactored.core.FileHashCalculator;
import top.r3944realms.docchecktoolrefactored.io.scanner.FileScanner;
import top.r3944realms.docchecktoolrefactored.io.scanner.RobustParallelScanner;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.concurrent.TimeUnit;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Performance comparison of {@link DuplicateFinder} with and without the
 * pre-counting (progress) pass, against a fixed local test directory.
 */
public class DuplicateFinderPerformanceTest {
    private static final String TEST_PATH = "D:/测试数据/JPG";
    private FileScanner scanner;
    private FileHashCalculator hashCalculator;

    @BeforeEach
    void setUp() {
        scanner = new RobustParallelScanner(20);
        hashCalculator = FileHashCalculator.defaultInstance();
    }

    @RepeatedTest(5)
    void compareFinderPerformance() throws IOException {
        // Measure without pre-counting first, then with, to match the original order.
        long msWithout = timeFindDuplicates(false);
        long msWith = timeFindDuplicates(true);
        System.out.println("Without pre-counting: " + msWithout + " ms");
        System.out.println("With pre-counting: " + msWith + " ms");
        // NOTE(review): the speedup assertion from the original remains disabled.
    }

    /** Runs one full duplicate scan and returns the elapsed wall time in milliseconds. */
    private long timeFindDuplicates(boolean enableProgress) throws IOException {
        long startedAt = System.nanoTime();
        new DuplicateFinder(scanner, hashCalculator, enableProgress)
                .findDuplicates(Paths.get(TEST_PATH));
        return TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startedAt);
    }

    @Test
    void verifySameResults() throws IOException {
        var withoutPrecount = new DuplicateFinder(scanner, hashCalculator, false)
                .findDuplicates(Paths.get(TEST_PATH));
        var withPrecount = new DuplicateFinder(scanner, hashCalculator, true)
                .findDuplicates(Paths.get(TEST_PATH));
        // NOTE(review): the result-equality assertion was disabled in the original
        // and is intentionally kept disabled here.
    }
}

View File

@ -0,0 +1,36 @@
package top.r3944realms.docchecktoolrefactored.test;
import lombok.extern.slf4j.Slf4j;
import top.r3944realms.docchecktoolrefactored.core.DuplicateFinder;
import top.r3944realms.docchecktoolrefactored.core.FileHashCalculator;
import top.r3944realms.docchecktoolrefactored.io.scanner.FileScanner;
import top.r3944realms.docchecktoolrefactored.io.scanner.RobustParallelScanner;
import top.r3944realms.docchecktoolrefactored.model.DuplicateGroup;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.List;
/**
 * Manual smoke test for {@link DuplicateFinder}: scans a local directory and
 * logs every duplicate group found.
 */
@Slf4j
public class DuplicateTest {
    public static void main(String[] args) throws IOException {
        // FIX: the scanner owns a ForkJoinPool (AutoCloseable) — close it when done.
        // FIX: removed the unused second scanner/calculator/finder trio, which
        // allocated another thread pool that was never used or shut down.
        try (RobustParallelScanner scanner = new RobustParallelScanner(20)) {
            FileHashCalculator hashCalculator = FileHashCalculator.defaultInstance();
            DuplicateFinder finder = new DuplicateFinder(scanner, hashCalculator);
            List<DuplicateGroup> duplicates = finder.findDuplicates(Paths.get("H:\\nw0\\newworld(1)(1)"));
            // Report each duplicate group with its member paths.
            duplicates.forEach(group -> {
                log.info("发现重复文件组({} bytes):", group.size());
                group.fileMetas().forEach(file ->
                        log.info(" {}", file.getPath())
                );
            });
        }
    }
}

View File

@ -0,0 +1,32 @@
package top.r3944realms.docchecktoolrefactored.test;
import lombok.extern.slf4j.Slf4j;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
/**
 * Manual smoke test for parallel directory walking (mirrors the traversal
 * logic used by ParallelFileScanner).
 */
@Slf4j
public class ParallelFileScannerTest {
    public static void main(String[] args) throws IOException {
        test1(Path.of("D:/测试数据/JPG"));
    }

    /**
     * Walks {@code rootPath} in parallel, collects all regular files, and logs the count.
     */
    private static void test1(Path rootPath) throws IOException {
        // FIX: Files.walk holds open directory handles — close the stream
        // with try-with-resources instead of suppressing the resource warning.
        final List<Path> files;
        try (var walk = Files.walk(rootPath)) {
            files = walk
                    .peek(p -> log.trace("visiting: {}", p))
                    .parallel()
                    .filter(p -> {
                        boolean isRegular = Files.isRegularFile(p);
                        if (!isRegular) {
                            log.debug("Skip non-regular : {} ", p);
                        }
                        return isRegular;
                    })
                    .peek(p -> log.trace("Found file: {}", p))
                    .toList(); // terminal op: consume the stream before it is closed
        }
        if (files.isEmpty()) {
            log.warn("No files found in directory: {}", rootPath);
        } else {
            log.debug("Found {} files in {}", files.size(), rootPath);
        }
    }
}