123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128 |
- package me.yoqi.pdf;
- import java.io.File;
- import java.io.FileWriter;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.List;
- import org.apache.pdfbox.pdmodel.PDDocument;
- import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
- import org.apache.pdfbox.text.PDFTextStripper;
- /**
- * 批量提取指定文件夹中所有pdf文件为txt格式。 并按照每句话分词 随机取出100句话。
- *
- * @author liuyuqi
- *
- */
- public class Main {
- // 项目目录
- private String projectPath;
- private String resultFile;
- private List<String> suffixList = new ArrayList<String>();
- public static void main(String[] args) {
- Main m = new Main();
- m.init();
- m.bathGetText();
- }
- /**
- * 初始化参数
- */
- public void init() {
- projectPath = "E:\\data\\workspace\\PDFOperation\\data";
- resultFile="E:\\data\\workspace\\PDFOperation\\output\\result.txt";
- suffixList.add(".pdf");// 增加后缀
- }
- // 保存的结果,输出
- public void outputData(String fileName, String content) {
- try {
- //打开一个写文件器,构造函数中的第二个参数true表示以追加形式写文件
- FileWriter writer = new FileWriter(fileName, true);
- writer.write(content);
- writer.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- public void bathGetText() {
- String filedir = this.projectPath;
- if (null == filedir || "".equals(filedir.trim())) {
- System.out.println("filedir 目录不对!");
- return;
- }
- filedir = filedir.trim();
- if (null == suffixList || suffixList.size() <= 0) {
- System.out.println("suffixList 没有要匹配的后缀!");
- return;
- }
- File f = new File(filedir);
- if (f.isDirectory()) {
- handleDirectory(f);
- } else {
- System.out.println("filedir 必须为目录");
- }
- }
- private void handleDirectory(File filedir) {
- // 目录
- File[] files = filedir.listFiles();
- for (File subFile : files) {
- if (subFile.isDirectory()) {
- handleDirectory(subFile);
- } else {
- // 文件
- for (String suffix : suffixList) {
- if (subFile.getName().endsWith(suffix)) {
- System.out.println(subFile.getName());
- getTextFromPDF(subFile);
- }
- }
- }
- }
- }
- /**
- * 处理单个pdf为字符串
- *
- * @param pdfFilePath
- * pdf文件路径
- */
- public void getTextFromPDF(File pdfFile) {
- PDDocument document = null;
- // 方式二:
- try {
- document = PDDocument.load(pdfFile);
- // 获取页码
- int pages = document.getNumberOfPages();
- System.out.println(pages);
- // 读文本内容
- PDFTextStripper stripper = new PDFTextStripper();
- // 设置按顺序输出
- stripper.setSortByPosition(true);
- stripper.setStartPage(1);
- stripper.setEndPage(pages);
- String content = stripper.getText(document);
- // System.out.println(content);
- outputData(resultFile, content);
- } catch (InvalidPasswordException e) {
- System.out.println(121);
- } catch (Exception e) {
- System.out.println(123331);
- }
- }
- private void stringWithOutChinese(){
- StringForChinese chinese=new StringForChinese();
- chinese.subStrWithOutChinese(str);
- }
- }
|