From 27158e3174b2641dc63206a087910077d1f253af Mon Sep 17 00:00:00 2001 From: hankcs Date: Wed, 9 Sep 2015 17:05:34 +0800 Subject: [PATCH] =?UTF-8?q?=E7=A7=AF=E7=B4=AF=E4=BA=86=E4=B8=80=E4=BA=9B?= =?UTF-8?q?=E5=B0=8F=E4=BC=98=E5=8C=96=EF=BC=8C=E5=B0=8F=E7=89=88=E6=9C=AC?= =?UTF-8?q?+1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pom.xml | 11 +- .../hankcs/hanlp/corpus/io/EasyReader.java | 114 ++++++++++++++++++ .../com/hankcs/hanlp/corpus/io/IOUtil.java | 57 +++++++++ .../hankcs/hanlp/corpus/io/LineHandler.java | 54 +++++++++ .../java/com/hankcs/test/model/TestCRF.java | 38 ++++-- 5 files changed, 266 insertions(+), 8 deletions(-) create mode 100644 src/main/java/com/hankcs/hanlp/corpus/io/EasyReader.java create mode 100644 src/main/java/com/hankcs/hanlp/corpus/io/LineHandler.java diff --git a/pom.xml b/pom.xml index a29c51e6e..de65e9474 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ com.hankcs hanlp - 1.2.4 + 1.2.5 HanLP http://www.hankcs.com/ @@ -79,6 +79,15 @@ + + + org.apache.maven.plugins + maven-failsafe-plugin + 2.18.1 + + true + + diff --git a/src/main/java/com/hankcs/hanlp/corpus/io/EasyReader.java b/src/main/java/com/hankcs/hanlp/corpus/io/EasyReader.java new file mode 100644 index 000000000..52c29a99e --- /dev/null +++ b/src/main/java/com/hankcs/hanlp/corpus/io/EasyReader.java @@ -0,0 +1,114 @@ +/* + * + * He Han + * me@hankcs.com + * 2015/7/29 16:35 + * + * + * Copyright (c) 2008-2015, 码农场. All Right Reserved, http://www.hankcs.com/ + * This source is subject to Hankcs. Please contact Hankcs to get more information. + * + */ +package com.hankcs.hanlp.corpus.io; + +import java.io.File; +import java.io.FileFilter; + +/** + * 文本读取工具 + * @author hankcs + */ +public class EasyReader +{ + /** + * 根目录 + */ + String root; + /** + * 是否输出进度 + */ + boolean verbose = true; + + /** + * 构造 + * @param root 根目录 + */ + public EasyReader(String root) + { + this.root = root; + } + + /** + * 构造 + * @param root 根目录 + * @param verbose 是否输出进度 + */ + public EasyReader(String root, boolean verbose) + { + this.root = root; + this.verbose = verbose; + } + + /** + * 读取 + * @param handler 处理逻辑 + * @param size 读取多少个文件 + * @throws Exception + */ + public void read(LineHandler handler, int size) throws Exception + { + File rootFile = new File(root); + File[] files; + if (rootFile.isDirectory()) + { + files = rootFile.listFiles(new FileFilter() + { + @Override + public boolean accept(File pathname) + { + return pathname.isFile() && !pathname.getName().endsWith(".bin"); + } + }); + if (files == null) + { + if (rootFile.isFile()) + files = new File[]{rootFile}; + else return; + } + } + else + { + files = new File[]{rootFile}; + } + + int n = 0; + int totalAddress = 0; + long start = System.currentTimeMillis(); + for (File file : files) + { + if (size-- == 0) break; + if (file.isDirectory()) continue; + if (verbose) System.out.printf("正在处理%s, %d / %d\n", file.getName(), ++n, files.length); + IOUtil.LineIterator lineIterator = new IOUtil.LineIterator(file.getAbsolutePath()); + while (lineIterator.hasNext()) + { + ++totalAddress; + String line = lineIterator.next(); + if (line.length() == 0) continue; + handler.handle(line); + } + } + handler.done(); + if (verbose) System.out.printf("处理了 %.2f 万行,花费了 %.2f min\n", totalAddress / 10000.0, (System.currentTimeMillis() - start) / 1000.0 / 60.0); + } + + /** + * 读取 + * @param handler 处理逻辑 + * @throws Exception + */ + public void read(LineHandler handler) throws Exception + { + read(handler, Integer.MAX_VALUE); + } +} diff --git a/src/main/java/com/hankcs/hanlp/corpus/io/IOUtil.java b/src/main/java/com/hankcs/hanlp/corpus/io/IOUtil.java index 0486eb9ed..b235966ac 100644 --- a/src/main/java/com/hankcs/hanlp/corpus/io/IOUtil.java +++ b/src/main/java/com/hankcs/hanlp/corpus/io/IOUtil.java @@ -361,4 +361,61 @@ public void remove() throw new UnsupportedOperationException("只读,不可写!"); } } + + /** + * 创建一个BufferedWriter + * + * @param path + * @return + * @throws FileNotFoundException + * @throws UnsupportedEncodingException + */ + public static BufferedWriter newBufferedWriter(String path) throws FileNotFoundException, UnsupportedEncodingException + { + return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), "UTF-8")); + } + + /** + * 创建一个BufferedReader + * @param path + * @return + * @throws FileNotFoundException + * @throws UnsupportedEncodingException + */ + public static BufferedReader newBufferedReader(String path) throws FileNotFoundException, UnsupportedEncodingException + { + return new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8")); + } + + public static BufferedWriter newBufferedWriter(String path, boolean append) throws FileNotFoundException, UnsupportedEncodingException + { + return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path, append), "UTF-8")); + } + + /** + * 获取最后一个分隔符的后缀 + * @param name + * @param delimiter + * @return + */ + public static String getSuffix(String name, String delimiter) + { + return name.substring(name.lastIndexOf(delimiter) + 1); + } + + /** + * 写数组,用制表符分割 + * @param bw + * @param params + * @throws IOException + */ + public static void writeLine(BufferedWriter bw, String... params) throws IOException + { + for (int i = 0; i < params.length - 1; i++) + { + bw.write(params[i]); + bw.write('\t'); + } + bw.write(params[params.length - 1]); + } } diff --git a/src/main/java/com/hankcs/hanlp/corpus/io/LineHandler.java b/src/main/java/com/hankcs/hanlp/corpus/io/LineHandler.java new file mode 100644 index 000000000..7ae7f57f3 --- /dev/null +++ b/src/main/java/com/hankcs/hanlp/corpus/io/LineHandler.java @@ -0,0 +1,54 @@ +/* + * + * He Han + * me@hankcs.com + * 2015/7/29 16:37 + * + * + * Copyright (c) 2008-2015, 码农场. All Right Reserved, http://www.hankcs.com/ + * This source is subject to Hankcs. Please contact Hankcs to get more information. + * + */ +package com.hankcs.hanlp.corpus.io; + +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; + +/** + * @author hankcs + */ +public abstract class LineHandler +{ + String delimiter = "\t"; + + public LineHandler(String delimiter) + { + this.delimiter = delimiter; + } + + public LineHandler() + { + } + + public void handle(String line) throws Exception + { + List tokenList = new LinkedList(); + int start = 0; + int end; + while ((end = line.indexOf(delimiter, start)) != -1) + { + tokenList.add(line.substring(start, end)); + start = end + 1; + } + tokenList.add(line.substring(start, line.length())); + handle(tokenList.toArray(new String[0])); + } + + public void done() throws IOException + { + // do noting + } + + public abstract void handle(String[] params) throws IOException; +} diff --git a/src/test/java/com/hankcs/test/model/TestCRF.java b/src/test/java/com/hankcs/test/model/TestCRF.java index 96be8e2e9..7106efe74 100644 --- a/src/test/java/com/hankcs/test/model/TestCRF.java +++ b/src/test/java/com/hankcs/test/model/TestCRF.java @@ -13,11 +13,15 @@ import com.hankcs.hanlp.HanLP; import com.hankcs.hanlp.collection.trie.bintrie.BinTrie; +import com.hankcs.hanlp.corpus.dictionary.EasyDictionary; import com.hankcs.hanlp.corpus.document.CorpusLoader; import com.hankcs.hanlp.corpus.document.Document; import com.hankcs.hanlp.corpus.document.sentence.word.IWord; import com.hankcs.hanlp.corpus.document.sentence.word.Word; import com.hankcs.hanlp.corpus.io.ByteArray; +import com.hankcs.hanlp.corpus.io.EasyReader; +import com.hankcs.hanlp.corpus.io.IOUtil; +import com.hankcs.hanlp.corpus.io.LineHandler; import com.hankcs.hanlp.corpus.util.Precompiler; import com.hankcs.hanlp.model.crf.FeatureFunction; import com.hankcs.hanlp.model.crf.FeatureTemplate; @@ -93,8 +97,8 @@ public void testSegment() throws Exception */ public void testPrepareCRFTrainingCorpus() throws Exception { - final BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("D:\\Tools\\CRF++-0.58\\example\\seg_cn\\2014人民日报语料BMES切分.txt"), "UTF-8")); - CorpusLoader.walk("H:\\seg_corpus", new CorpusLoader.Handler() + final BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("e:\\2014.txt"), "UTF-8")); + CorpusLoader.walk("D:\\Doc\\语料库\\2014_hankcs", new CorpusLoader.Handler() { @Override public void handle(Document document) @@ -102,8 +106,10 @@ public void handle(Document document) try { List> sentenceList = document.getSimpleSentenceList(); + if (sentenceList.size() == 0) return; for (List sentence : sentenceList) { + if (sentence.size() == 0) continue; for (IWord iWord : sentence) { String word = iWord.getValue(); @@ -118,28 +124,28 @@ public void handle(Document document) bw.write(word); bw.write('\t'); bw.write('S'); - bw.newLine(); + bw.write('\n'); } else { bw.write(word.charAt(0)); bw.write('\t'); bw.write('B'); - bw.newLine(); + bw.write('\n'); for (int i = 1; i < word.length() - 1; ++i) { bw.write(word.charAt(i)); bw.write('\t'); bw.write('M'); - bw.newLine(); + bw.write('\n'); } bw.write(word.charAt(word.length() - 1)); bw.write('\t'); bw.write('E'); - bw.newLine(); + bw.write('\n'); } } - bw.newLine(); + bw.write('\n'); } } catch (IOException e) @@ -187,4 +193,22 @@ public void testLoadModelWithBiGramFeature() throws Exception model.tag(table); System.out.println(table); } + + public void testRemoveSpace() throws Exception + { + String inputPath = "E:\\2014.txt"; + String outputPath = "E:\\2014f.txt"; + BufferedReader br = IOUtil.newBufferedReader(inputPath); + BufferedWriter bw = IOUtil.newBufferedWriter(outputPath); + String line = ""; + int preLength = 0; + while ((line = br.readLine()) != null) + { + if (preLength == 0 && line.length() == 0) continue; + bw.write(line); + bw.newLine(); + preLength = line.length(); + } + bw.close(); + } }