From 90cd0a5286fbabb51a843d730c6c7bcf397c356e Mon Sep 17 00:00:00 2001 From: hankcs Date: Tue, 26 May 2015 16:44:25 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E5=96=84=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../hanlp/dictionary/BiGramDictionary.java | 41 +++++++++++++------ .../hanlp/dictionary/CoreDictionary.java | 7 +++- .../hanlp/dictionary/CustomDictionary.java | 15 +++++++ .../com/hankcs/demo/DemoAtFirstSight.java | 1 + .../demo/DemoMultithreadingSegment.java | 5 +++ 5 files changed, 56 insertions(+), 13 deletions(-) diff --git a/src/main/java/com/hankcs/hanlp/dictionary/BiGramDictionary.java b/src/main/java/com/hankcs/hanlp/dictionary/BiGramDictionary.java index f5e080a5e..e379be49f 100644 --- a/src/main/java/com/hankcs/hanlp/dictionary/BiGramDictionary.java +++ b/src/main/java/com/hankcs/hanlp/dictionary/BiGramDictionary.java @@ -23,10 +23,13 @@ import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.util.*; + import static com.hankcs.hanlp.utility.Predefine.logger; /** * 2元语法词典 + * + * @deprecated 现在基于DoubleArrayTrie的BiGramDictionary已经由CoreBiGramTableDictionary替代,可以显著降低内存 * @author hankcs */ public class BiGramDictionary @@ -35,6 +38,7 @@ public class BiGramDictionary public final static String path = HanLP.Config.BiGramDictionaryPath; public static final int totalFrequency = 37545990; + // 自动加载词典 static { @@ -49,9 +53,10 @@ public class BiGramDictionary logger.info(path + "加载成功,耗时" + (System.currentTimeMillis() - start) + "ms"); } } + public static boolean load(String path) { - logger.info("二元词典开始加载:"+ path); + logger.info("二元词典开始加载:" + path); trie = new DoubleArrayTrie(); boolean create = !loadDat(path); if (!create) return true; @@ -70,18 +75,20 @@ public static boolean load(String path) } br.close(); logger.info("二元词典读取完毕:" + path + ",开始构建双数组Trie树(DoubleArrayTrie)……"); - } catch (FileNotFoundException e) + } + catch (FileNotFoundException e) { - logger.severe("二元词典" + path + "不存在!"+ e); + logger.severe("二元词典" + path + "不存在!" + e); return false; - } catch (IOException e) + } + catch (IOException e) { - logger.severe("二元词典" + path + "读取错误!"+ e); + logger.severe("二元词典" + path + "读取错误!" + e); return false; } int resultCode = trie.build(map); - logger.info("二元词典DAT构建结果:{}"+ resultCode); + logger.info("二元词典DAT构建结果:{}" + resultCode); // reSaveDictionary(map, path); logger.info("二元词典加载成功:" + trie.size() + "个词条"); if (create) @@ -109,6 +116,7 @@ public static boolean load(String path) /** * 从dat文件中加载排好的trie + * * @param path * @return */ @@ -137,8 +145,9 @@ private static boolean loadDat(String path) /** * 找寻特殊字串,如未##串 - * @deprecated 没事就不要用了 + * * @return 一个包含特殊词串的set + * @deprecated 没事就不要用了 */ public static Set _findSpecialString() { @@ -162,10 +171,12 @@ public static Set _findSpecialString() } } br.close(); - } catch (FileNotFoundException e) + } + catch (FileNotFoundException e) { e.printStackTrace(); - } catch (IOException e) + } + catch (IOException e) { e.printStackTrace(); } @@ -175,8 +186,9 @@ public static Set _findSpecialString() /** * 获取共现频次 + * * @param from 第一个词 - * @param to 第二个词 + * @param to 第二个词 * @return 第一个词@第二个词出现的频次 */ public static int getBiFrequency(String from, String to) @@ -186,6 +198,7 @@ public static int getBiFrequency(String from, String to) /** * 获取共现频次 + * * @param twoWord 用@隔开的两个词 * @return 共现频次 */ @@ -197,6 +210,7 @@ public static int getBiFrequency(String twoWord) /** * 将NGram词典重新写回去 + * * @param map * @param path * @return @@ -217,6 +231,7 @@ private static boolean reSaveDictionary(TreeMap map, String pat /** * 接受键数组与值数组,排序以供建立trie树 + * * @param wordList * @param freqList */ @@ -238,10 +253,12 @@ private static void sortListForBuildTrie(List wordList, List fr bw.newLine(); } bw.close(); - } catch (FileNotFoundException e) + } + catch (FileNotFoundException e) { e.printStackTrace(); - } catch (IOException e) + } + catch (IOException e) { e.printStackTrace(); } diff --git a/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java b/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java index 88407e743..e93071f56 100644 --- a/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java +++ b/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java @@ -57,7 +57,7 @@ public class CoreDictionary public static final int M_WORD_ID = getWordID(Predefine.TAG_NUMBER); public static final int NX_WORD_ID = getWordID(Predefine.TAG_PROPER); - public static boolean load(String path) + private static boolean load(String path) { logger.info("核心词典开始加载:" + path); if (loadDat(path)) return true; @@ -196,6 +196,11 @@ public static int getTermFrequency(String term) return attribute.totalFrequency; } + /** + * 是否包含词语 + * @param key + * @return + */ public static boolean contains(String key) { return trie.get(key) != null; diff --git a/src/main/java/com/hankcs/hanlp/dictionary/CustomDictionary.java b/src/main/java/com/hankcs/hanlp/dictionary/CustomDictionary.java index 489cebe51..44910772f 100644 --- a/src/main/java/com/hankcs/hanlp/dictionary/CustomDictionary.java +++ b/src/main/java/com/hankcs/hanlp/dictionary/CustomDictionary.java @@ -330,12 +330,22 @@ public String toString() '}'; } + /** + * 词典中是否含有词语 + * @param key 词语 + * @return 是否包含 + */ public static boolean contains(String key) { if (dat.exactMatchSearch(key) >= 0) return true; return trie != null && trie.containsKey(key); } + /** + * 获取一个BinTrie的查询工具 + * @param charArray 文本 + * @return 查询者 + */ public static BaseSearcher getSearcher(char[] charArray) { return new Searcher(charArray); @@ -399,6 +409,11 @@ public static BinTrie getTrie() return trie; } + /** + * 解析一段文本(目前采用了BinTrie+DAT的混合储存形式,此方法可以统一两个数据结构) + * @param text 文本 + * @param processor 处理器 + */ public static void parseText(char[] text, AhoCorasickDoubleArrayTrie.IHit processor) { if (trie != null) diff --git a/src/test/java/com/hankcs/demo/DemoAtFirstSight.java b/src/test/java/com/hankcs/demo/DemoAtFirstSight.java index dde579a8c..70092e107 100644 --- a/src/test/java/com/hankcs/demo/DemoAtFirstSight.java +++ b/src/test/java/com/hankcs/demo/DemoAtFirstSight.java @@ -15,6 +15,7 @@ /** * 第一个Demo,惊鸿一瞥 + * * @author hankcs */ public class DemoAtFirstSight diff --git a/src/test/java/com/hankcs/demo/DemoMultithreadingSegment.java b/src/test/java/com/hankcs/demo/DemoMultithreadingSegment.java index cd7b19151..c1923d0fe 100644 --- a/src/test/java/com/hankcs/demo/DemoMultithreadingSegment.java +++ b/src/test/java/com/hankcs/demo/DemoMultithreadingSegment.java @@ -57,5 +57,10 @@ public static void main(String[] args) costTime = (System.currentTimeMillis() - start) / (double) 1000; System.out.printf("多线程分词速度:%.2f字每秒\n", text.length() / costTime); System.gc(); + + // Note: + // 内部的并行化机制可以对1万字以上的大文本开启多线程分词 + // 另一方面,HanLP中的任何Segment本身都是线程安全的。 + // 你可以开10个线程用同一个CRFSegment对象切分任意文本,不需要任何线程同步的措施,每个线程都可以得到正确的结果。 } }