Main.java 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  1. package me.yoqi.pdf;
  2. import java.io.File;
  3. import java.io.FileWriter;
  4. import java.io.IOException;
  5. import java.util.ArrayList;
  6. import java.util.List;
  7. import org.apache.pdfbox.pdmodel.PDDocument;
  8. import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
  9. import org.apache.pdfbox.text.PDFTextStripper;
  10. /**
  11. * 批量提取指定文件夹中所有pdf文件为txt格式。 并按照每句话分词 随机取出100句话。
  12. *
  13. * @author liuyuqi
  14. *
  15. */
  16. public class Main {
  17. // 项目目录
  18. private String projectPath;
  19. private String resultFile;
  20. private List<String> suffixList = new ArrayList<String>();
  21. public static void main(String[] args) {
  22. Main m = new Main();
  23. m.init();
  24. m.bathGetText();
  25. }
  26. /**
  27. * 初始化参数
  28. */
  29. public void init() {
  30. projectPath = "E:\\data\\workspace\\PDFOperation\\data";
  31. resultFile="E:\\data\\workspace\\PDFOperation\\output\\result.txt";
  32. suffixList.add(".pdf");// 增加后缀
  33. }
  34. // 保存的结果,输出
  35. public void outputData(String fileName, String content) {
  36. try {
  37. //打开一个写文件器,构造函数中的第二个参数true表示以追加形式写文件
  38. FileWriter writer = new FileWriter(fileName, true);
  39. writer.write(content);
  40. writer.close();
  41. } catch (IOException e) {
  42. e.printStackTrace();
  43. }
  44. }
  45. public void bathGetText() {
  46. String filedir = this.projectPath;
  47. if (null == filedir || "".equals(filedir.trim())) {
  48. System.out.println("filedir 目录不对!");
  49. return;
  50. }
  51. filedir = filedir.trim();
  52. if (null == suffixList || suffixList.size() <= 0) {
  53. System.out.println("suffixList 没有要匹配的后缀!");
  54. return;
  55. }
  56. File f = new File(filedir);
  57. if (f.isDirectory()) {
  58. handleDirectory(f);
  59. } else {
  60. System.out.println("filedir 必须为目录");
  61. }
  62. }
  63. private void handleDirectory(File filedir) {
  64. // 目录
  65. File[] files = filedir.listFiles();
  66. for (File subFile : files) {
  67. if (subFile.isDirectory()) {
  68. handleDirectory(subFile);
  69. } else {
  70. // 文件
  71. for (String suffix : suffixList) {
  72. if (subFile.getName().endsWith(suffix)) {
  73. System.out.println(subFile.getName());
  74. getTextFromPDF(subFile);
  75. }
  76. }
  77. }
  78. }
  79. }
  80. /**
  81. * 处理单个pdf为字符串
  82. *
  83. * @param pdfFilePath
  84. * pdf文件路径
  85. */
  86. public void getTextFromPDF(File pdfFile) {
  87. PDDocument document = null;
  88. // 方式二:
  89. try {
  90. document = PDDocument.load(pdfFile);
  91. // 获取页码
  92. int pages = document.getNumberOfPages();
  93. System.out.println(pages);
  94. // 读文本内容
  95. PDFTextStripper stripper = new PDFTextStripper();
  96. // 设置按顺序输出
  97. stripper.setSortByPosition(true);
  98. stripper.setStartPage(1);
  99. stripper.setEndPage(pages);
  100. String content = stripper.getText(document);
  101. // System.out.println(content);
  102. outputData(resultFile, content);
  103. } catch (InvalidPasswordException e) {
  104. System.out.println(121);
  105. } catch (Exception e) {
  106. System.out.println(123331);
  107. }
  108. }
  109. private void stringWithOutChinese(){
  110. StringForChinese chinese=new StringForChinese();
  111. chinese.subStrWithOutChinese(str);
  112. }
  113. }