liuyuqi-dellpc 7 years ago
parent
commit
d513412b38

+ 21 - 0
.classpath

@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" path="java/src"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="src" output="target/classes" path="src">
+		<attributes>
+			<attribute name="optional" value="true"/>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="output" path="target/classes"/>
+</classpath>

+ 1 - 0
.gitignore

@@ -1,2 +1,3 @@
 /bin/
 /target/
+/.settings

+ 35 - 0
.project

@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>PDFOperation</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.python.pydev.PyDevBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.wst.common.project.facet.core.builder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.m2e.core.maven2Builder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.m2e.core.maven2Nature</nature>
+		<nature>org.eclipse.wst.common.project.facet.core.nature</nature>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+		<nature>org.python.pydev.pythonNature</nature>
+	</natures>
+</projectDescription>

+ 8 - 0
.pydevproject

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?eclipse-pydev version="1.0"?><pydev_project>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
+<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
+<path>/${PROJECT_DIR_NAME}/python</path>
+</pydev_pathproperty>
+</pydev_project>

+ 6 - 0
data/说明.txt

@@ -0,0 +1,6 @@
+数据待处理文件夹。文件包含:
+
+	429-499(69).rar
+	法例.rar
+	国际条约.rar
+	基本法.rar

+ 4 - 0
src/me/yoqi/pdf/Main.java → java/src/me/yoqi/pdf/Main.java

@@ -120,5 +120,9 @@ public class Main {
 
 		}
 	}
+	private void stringWithOutChinese(){
+		StringForChinese chinese=new  StringForChinese();
+		chinese.subStrWithOutChinese(str);
+	}
 
 }

+ 23 - 0
java/src/me/yoqi/pdf/StringForChinese.java

@@ -0,0 +1,23 @@
+package me.yoqi.pdf;
+
/**
 * String helper that strips Chinese characters out of text.
 */
public class StringForChinese {

	/** First UTF-16 code unit treated as Chinese (U+4E00, inclusive). */
	private static final char CHINESE_RANGE_START = '\u4e00';

	/** Last UTF-16 code unit treated as Chinese (U+9FA5, inclusive). */
	private static final char CHINESE_RANGE_END = '\u9fa5';

	/**
	 * Removes every Chinese character from a string.
	 *
	 * A character counts as Chinese when it lies in the range U+4E00..U+9FA5
	 * (both ends inclusive). Everything else, including CJK punctuation and
	 * characters outside that range, is kept in its original order.
	 *
	 * @param str the original string; may be {@code null}
	 * @return the text with the Chinese characters removed, or an empty
	 *         string when {@code str} is {@code null}
	 */
	public String subStrWithOutChinese(String str) {
		if (str == null) {
			return "";
		}
		// StringBuilder avoids re-copying the whole result on every kept
		// character, and the direct range check avoids compiling a regex
		// for each character.
		StringBuilder kept = new StringBuilder(str.length());
		for (int i = 0; i < str.length(); i++) {
			char c = str.charAt(i);
			if (c < CHINESE_RANGE_START || c > CHINESE_RANGE_END) {
				kept.append(c);
			}
		}
		return kept.toString();
	}
}

+ 0 - 0
src/me/yoqi/pdf/Test.java → java/src/me/yoqi/pdf/Test.java


+ 2 - 0
output/说明.txt

@@ -0,0 +1,2 @@
+输出文件夹,包含文件:
+ 1、

+ 19 - 0
python/txtConvert.py

@@ -0,0 +1,19 @@
+#coding=utf-8
+'''
+Created on 2017年8月29日
+@version:python 3.6
+@author: liuyuqi
+'''
+
def readTxt(filePath):
    """Return the full contents of the text file at ``filePath``.

    The file is opened with ``open``'s default (read, text) mode. The
    ``with`` block closes the handle even if reading raises.

    :param filePath: path of the file to read
    :return: everything read from the file
    """
    with open(filePath) as f:
        return f.read()
+
def writeTxt(filePath='../data/data.txt', text='hello========hello======\n  he llo\n'):
    """Write ``text`` to ``filePath``, replacing any existing content.

    Called with no arguments it writes the sample text to
    ``../data/data.txt``. The ``with`` block guarantees the file is closed
    even if writing raises; per the original author's note, a handle left
    open makes other programs open the file as read-only.

    :param filePath: path of the file to (over)write
    :param text: the text to write
    """
    with open(filePath, 'w') as f:
        f.write(text)
+
+

+ 8 - 0
readme.md

@@ -6,6 +6,14 @@
 * 2、批量读取所有pdf文档,对文档每一句进行划分,随机提取100条句子。
 
 ## 使用:
+java 代码实现
+	1、pdf文件读取(python的pdf包还很不成熟,试过几个,部分pdf会出现异常)
+	2、去除空格
+	3、去除中文字符
+
+python 代码实现
+	分句
+	分词
 * 1、clone代码
 * 2、设置main.java中的项目目录,输出文件。
 * 3、运行main代码即可。