Skip to content

Commit

Permalink
完善文档
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed May 26, 2015
1 parent 9100577 commit 90cd0a5
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 13 deletions.
41 changes: 29 additions & 12 deletions src/main/java/com/hankcs/hanlp/dictionary/BiGramDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,13 @@
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.*;

import static com.hankcs.hanlp.utility.Predefine.logger;

/**
* 2元语法词典
*
* @deprecated 现在基于DoubleArrayTrie的BiGramDictionary已经由CoreBiGramTableDictionary替代,可以显著降低内存
* @author hankcs
*/
public class BiGramDictionary
Expand All @@ -35,6 +38,7 @@ public class BiGramDictionary

public final static String path = HanLP.Config.BiGramDictionaryPath;
public static final int totalFrequency = 37545990;

// 自动加载词典
static
{
Expand All @@ -49,9 +53,10 @@ public class BiGramDictionary
logger.info(path + "加载成功,耗时" + (System.currentTimeMillis() - start) + "ms");
}
}

public static boolean load(String path)
{
logger.info("二元词典开始加载:"+ path);
logger.info("二元词典开始加载:" + path);
trie = new DoubleArrayTrie<Integer>();
boolean create = !loadDat(path);
if (!create) return true;
Expand All @@ -70,18 +75,20 @@ public static boolean load(String path)
}
br.close();
logger.info("二元词典读取完毕:" + path + ",开始构建双数组Trie树(DoubleArrayTrie)……");
} catch (FileNotFoundException e)
}
catch (FileNotFoundException e)
{
logger.severe("二元词典" + path + "不存在!"+ e);
logger.severe("二元词典" + path + "不存在!" + e);
return false;
} catch (IOException e)
}
catch (IOException e)
{
logger.severe("二元词典" + path + "读取错误!"+ e);
logger.severe("二元词典" + path + "读取错误!" + e);
return false;
}

int resultCode = trie.build(map);
logger.info("二元词典DAT构建结果:{}"+ resultCode);
logger.info("二元词典DAT构建结果:{}" + resultCode);
// reSaveDictionary(map, path);
logger.info("二元词典加载成功:" + trie.size() + "个词条");
if (create)
Expand Down Expand Up @@ -109,6 +116,7 @@ public static boolean load(String path)

/**
* 从dat文件中加载排好的trie
*
* @param path
* @return
*/
Expand Down Expand Up @@ -137,8 +145,9 @@ private static boolean loadDat(String path)

/**
* 找寻特殊字串,如未##串
* @deprecated 没事就不要用了
*
* @return 一个包含特殊词串的set
* @deprecated 没事就不要用了
*/
public static Set<String> _findSpecialString()
{
Expand All @@ -162,10 +171,12 @@ public static Set<String> _findSpecialString()
}
}
br.close();
} catch (FileNotFoundException e)
}
catch (FileNotFoundException e)
{
e.printStackTrace();
} catch (IOException e)
}
catch (IOException e)
{
e.printStackTrace();
}
Expand All @@ -175,8 +186,9 @@ public static Set<String> _findSpecialString()

/**
* 获取共现频次
*
* @param from 第一个词
* @param to 第二个词
* @param to 第二个词
* @return 第一个词@第二个词出现的频次
*/
public static int getBiFrequency(String from, String to)
Expand All @@ -186,6 +198,7 @@ public static int getBiFrequency(String from, String to)

/**
* 获取共现频次
*
* @param twoWord 用@隔开的两个词
* @return 共现频次
*/
Expand All @@ -197,6 +210,7 @@ public static int getBiFrequency(String twoWord)

/**
* 将NGram词典重新写回去
*
* @param map
* @param path
* @return
Expand All @@ -217,6 +231,7 @@ private static boolean reSaveDictionary(TreeMap<String, Integer> map, String pat

/**
* 接受键数组与值数组,排序以供建立trie树
*
* @param wordList
* @param freqList
*/
Expand All @@ -238,10 +253,12 @@ private static void sortListForBuildTrie(List<String> wordList, List<Integer> fr
bw.newLine();
}
bw.close();
} catch (FileNotFoundException e)
}
catch (FileNotFoundException e)
{
e.printStackTrace();
} catch (IOException e)
}
catch (IOException e)
{
e.printStackTrace();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ public class CoreDictionary
public static final int M_WORD_ID = getWordID(Predefine.TAG_NUMBER);
public static final int NX_WORD_ID = getWordID(Predefine.TAG_PROPER);

public static boolean load(String path)
private static boolean load(String path)
{
logger.info("核心词典开始加载:" + path);
if (loadDat(path)) return true;
Expand Down Expand Up @@ -196,6 +196,11 @@ public static int getTermFrequency(String term)
return attribute.totalFrequency;
}

/**
* 是否包含词语
* @param key
* @return
*/
public static boolean contains(String key)
{
return trie.get(key) != null;
Expand Down
15 changes: 15 additions & 0 deletions src/main/java/com/hankcs/hanlp/dictionary/CustomDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -330,12 +330,22 @@ public String toString()
'}';
}

/**
 * Tells whether the custom dictionary contains the given word.
 * <p>
 * The lookup first tries an exact match against {@code dat}; only when that
 * misses does it consult {@code trie}, which is skipped if it is {@code null}.
 *
 * @param key the word to look up
 * @return {@code true} if either structure holds the word, {@code false} otherwise
 */
public static boolean contains(String key)
{
    // A non-negative exact-match result is treated as a hit in dat.
    boolean foundInDat = dat.exactMatchSearch(key) >= 0;
    if (foundInDat)
    {
        return true;
    }
    // trie may be null, in which case there is nothing further to search.
    if (trie == null)
    {
        return false;
    }
    return trie.containsKey(key);
}

/**
* 获取一个BinTrie的查询工具
* @param charArray 文本
* @return 查询者
*/
public static BaseSearcher getSearcher(char[] charArray)
{
return new Searcher(charArray);
Expand Down Expand Up @@ -399,6 +409,11 @@ public static BinTrie<CoreDictionary.Attribute> getTrie()
return trie;
}

/**
* 解析一段文本(目前采用了BinTrie+DAT的混合储存形式,此方法可以统一两个数据结构)
* @param text 文本
* @param processor 处理器
*/
public static void parseText(char[] text, AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute> processor)
{
if (trie != null)
Expand Down
1 change: 1 addition & 0 deletions src/test/java/com/hankcs/demo/DemoAtFirstSight.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

/**
* 第一个Demo,惊鸿一瞥
*
* @author hankcs
*/
public class DemoAtFirstSight
Expand Down
5 changes: 5 additions & 0 deletions src/test/java/com/hankcs/demo/DemoMultithreadingSegment.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,5 +57,10 @@ public static void main(String[] args)
costTime = (System.currentTimeMillis() - start) / (double) 1000;
System.out.printf("多线程分词速度:%.2f字每秒\n", text.length() / costTime);
System.gc();

// Note:
// 内部的并行化机制可以对1万字以上的大文本开启多线程分词
// 另一方面,HanLP中的任何Segment本身都是线程安全的。
// 你可以开10个线程用同一个CRFSegment对象切分任意文本,不需要任何线程同步的措施,每个线程都可以得到正确的结果。
}
}

0 comments on commit 90cd0a5

Please sign in to comment.