12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864 |
- using System;
- using System.Collections.Generic;
- using System.Linq;
- using System.Text;
- using System.IO;
- using System.Collections;
- using System.Text.RegularExpressions;
- namespace Ant.Service.Utilities
- {
- /// <summary>
- /// 分词类改写自ShootSearch 中文分词组件
- /// </summary>
- //----------------调用----------------------
- //Segment seg = new Segment();
- //seg.InitWordDics();
- //seg.EnablePrefix = true;
- //seg.Separator =" ";
- //seg.SegmentText("字符串", false).Trim();
- //-------------------------------------------
- public class Segment
- {
- #region 私有字段
- private string m_DicPath = System.Web.HttpContext.Current.Server.MapPath("ShootSeg/sDict.dic"); // main dictionary file; resolved through the current HttpContext when the instance is created
- private string m_NoisePath = System.Web.HttpContext.Current.Server.MapPath("ShootSeg/sNoise.dic"); // noise-word list file
- private string m_NumberPath = System.Web.HttpContext.Current.Server.MapPath("ShootSeg/sNumber.dic"); // list of characters classified as numbers
- private string m_WordPath = System.Web.HttpContext.Current.Server.MapPath("ShootSeg/sWord.dic"); // list of characters classified as letters
- private string m_PrefixPath = System.Web.HttpContext.Current.Server.MapPath("ShootSeg/sPrefix.dic"); // name-prefix list file
- private Hashtable htWords; // two-level table built by InitWordDics: first char -> (second char -> SegList of remaining suffixes)
- private ArrayList alNoise; // lines of the noise file
- private ArrayList alNumber; // lines of the number file; consulted by GetCharType
- private ArrayList alWord; // lines of the letter file; consulted by GetCharType
- private ArrayList alPrefix; // lines of the prefix file; empty list when name correction is switched off
- private double m_EventTime = 0; // duration in milliseconds of the last load / segment / sort action
- /// <summary>
- /// Separator inserted between segmented words (a single space by default).
- /// </summary>
- private string m_Separator = " ";
- /// <summary>
- /// Regular-expression pattern used to test whether a single character is a Chinese character.
- /// </summary>
- private string strChinese = "[\u4e00-\u9fa5]";
- #endregion
- #region 公有属性
/// <summary>
/// Gets or sets the path of the main dictionary file, which
/// <c>InitWordDics</c>, <c>SortDic</c> and <c>Optimize</c> read or rewrite.
/// </summary>
public string DicPath
{
    get { return this.m_DicPath; }
    set { this.m_DicPath = value; }
}
/// <summary>
/// Stores a value in the ASP.NET application state under the given key.
/// </summary>
/// <param name="key">Key under which the value is stored.</param>
/// <param name="val">Value to store; <c>null</c> is replaced by the string " ".</param>
private static void SetCache(string key, object val)
{
    if (val == null) val = " ";

    System.Web.HttpApplicationState application = System.Web.HttpContext.Current.Application;
    application.Lock();
    try
    {
        application.Set(key, val);
    }
    finally
    {
        // Always release the application lock, even when Set throws;
        // otherwise the application state would stay locked.
        application.UnLock();
    }
}
/// <summary>
/// Reads a value from the ASP.NET application state.
/// </summary>
/// <param name="key">Key of the entry to read.</param>
/// <returns>Whatever the application state returns for <paramref name="key"/>.</returns>
private static object GetCache(string key)
{
    object cached = System.Web.HttpContext.Current.Application.Get(key);
    return cached;
}
/// <summary>
/// Gets or sets the path of the noise-word list file. The list is loaded by
/// <c>InitWordDics</c>; nothing else in this class consults it.
/// </summary>
public string NoisePath
{
    get { return this.m_NoisePath; }
    set { this.m_NoisePath = value; }
}
/// <summary>
/// Gets or sets the path of the number dictionary file.
/// </summary>
public string NumberPath
{
    get { return this.m_NumberPath; }
    set { this.m_NumberPath = value; }
}
/// <summary>
/// Gets or sets the path of the letter dictionary file.
/// </summary>
public string WordPath
{
    get { return this.m_WordPath; }
    set { this.m_WordPath = value; }
}
/// <summary>
/// Gets or sets the path of the name-prefix dictionary file, used for
/// correcting the segmentation of personal names.
/// </summary>
public string PrefixPath
{
    get { return this.m_PrefixPath; }
    set { this.m_PrefixPath = value; }
}
/// <summary>
/// Gets or sets whether name correction is enabled.
/// Setting <c>true</c> (re)loads the prefix list from <c>PrefixPath</c>;
/// setting <c>false</c> replaces it with an empty list.
/// The getter reports <c>true</c> only when a non-empty prefix list is loaded.
/// </summary>
public bool EnablePrefix
{
    get
    {
        // alPrefix stays null until InitWordDics or this setter has run;
        // report "disabled" in that case instead of throwing.
        return alPrefix != null && alPrefix.Count > 0;
    }
    set
    {
        if (value)
            alPrefix = LoadWords(PrefixPath, alPrefix);
        else
            alPrefix = new ArrayList();
    }
}
/// <summary>
/// Gets the duration, in milliseconds, of the most recent load, segmentation
/// or sort action. May be 0 when a short string is segmented.
/// </summary>
public double EventTime
{
    get { return this.m_EventTime; }
}
/// <summary>
/// Gets or sets the separator placed between segmented words (a space by default).
/// Assigning <c>null</c> or an empty string is ignored and keeps the current value.
/// </summary>
public string Separator
{
    get { return this.m_Separator; }
    set
    {
        if (string.IsNullOrEmpty(value))
        {
            return;
        }
        this.m_Separator = value;
    }
}
- #endregion
- #region 构造方法
/// <summary>
/// Creates a segmenter that uses the default dictionary paths.
/// No dictionary is loaded here; call <c>InitWordDics</c> before segmenting.
/// </summary>
public Segment()
{
}
/// <summary>
/// Creates a segmenter with explicit dictionary paths and loads the dictionaries immediately.
/// </summary>
/// <param name="p_DicPath">Path of the main dictionary file.</param>
/// <param name="p_NoisePath">Path of the noise-word list file.</param>
/// <param name="p_NumberPath">Path of the number dictionary file.</param>
/// <param name="p_WordPath">Path of the letter dictionary file.</param>
public Segment(string p_DicPath, string p_NoisePath, string p_NumberPath, string p_WordPath)
{
    // Each argument goes to its own field. Previously all four were written to
    // m_WordPath, so the first three paths were silently discarded.
    m_DicPath = p_DicPath;
    m_NoisePath = p_NoisePath;
    m_NumberPath = p_NumberPath;
    m_WordPath = p_WordPath;
    this.InitWordDics();
}
- #endregion
- #region 公有方法
/// <summary>
/// Loads all dictionaries.
/// The main dictionary is parsed into a two-level table
/// (first character -> second character -> list of remaining suffixes, with the
/// literal "null" marking a complete two-character word) and stored in the
/// application cache under "jcms_dict". When that cache entry already exists the
/// file is not read again. The noise, number, letter and prefix lists are always
/// reloaded. <c>EventTime</c> is updated with the elapsed milliseconds.
/// </summary>
public void InitWordDics()
{
    DateTime start = DateTime.Now;
    if (GetCache("jcms_dict") == null)
    {
        Hashtable root = new Hashtable();
        // 'using' guarantees the file handle is released even if parsing throws.
        using (StreamReader reader = new StreamReader(DicPath, System.Text.Encoding.UTF8))
        {
            string strline = reader.ReadLine();
            // Reading stops at end of file or at the first blank line.
            while (strline != null && strline.Trim() != "")
            {
                // Entries need at least two characters; shorter lines are skipped
                // rather than crashing on Substring(1, 1).
                if (strline.Length >= 2)
                {
                    string strChar1 = strline.Substring(0, 1);
                    string strChar2 = strline.Substring(1, 1);

                    Hashtable father = (Hashtable)root[strChar1];
                    if (father == null)
                    {
                        father = new Hashtable();
                        root.Add(strChar1, father);
                    }

                    SegList list = (SegList)father[strChar2];
                    if (list == null)
                    {
                        list = new SegList();
                        father.Add(strChar2, list);
                    }

                    // "null" marks that the two characters alone form a word.
                    if (strline.Length > 2)
                        list.Add(strline.Substring(2));
                    else
                        list.Add("null");
                }
                strline = reader.ReadLine();
            }
        }
        // Publish only a completely built table.
        htWords = root;
        SetCache("jcms_dict", root);
    }
    htWords = (Hashtable)GetCache("jcms_dict");
    alNoise = LoadWords(NoisePath, alNoise);
    alNumber = LoadWords(NumberPath, alNumber);
    alWord = LoadWords(WordPath, alWord);
    alPrefix = LoadWords(PrefixPath, alPrefix);
    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
}
/// <summary>
/// Reads a UTF-8 text file and returns its lines, in file order, as a new list.
/// </summary>
/// <param name="strPath">Path of the file to read.</param>
/// <param name="list">Ignored; kept only so existing callers keep compiling. A new list is always returned.</param>
/// <returns>A new <see cref="ArrayList"/> holding one string per line of the file.</returns>
public ArrayList LoadWords(string strPath, ArrayList list)
{
    ArrayList words = new ArrayList();
    // 'using' closes the reader even when reading fails part-way through.
    using (StreamReader reader = new StreamReader(strPath, System.Text.Encoding.UTF8))
    {
        string strline;
        while ((strline = reader.ReadLine()) != null)
        {
            words.Add(strline);
        }
    }
    return words;
}
/// <summary>
/// Writes every entry of the loaded dictionary to the console, one per line,
/// as first character + second character + stored suffix.
/// </summary>
public void OutWords()
{
    foreach (DictionaryEntry firstLevel in htWords)
    {
        Hashtable secondLevelTable = (Hashtable)firstLevel.Value;
        foreach (DictionaryEntry secondLevel in secondLevelTable)
        {
            SegList suffixes = (SegList)secondLevel.Value;
            int index = 0;
            while (index < suffixes.Count)
            {
                string entry = firstLevel.Key.ToString()
                    + secondLevel.Key.ToString()
                    + suffixes.GetElem(index).ToString();
                Console.WriteLine(entry);
                index++;
            }
        }
    }
}
/// <summary>
/// Writes each element of a list to the console, one per line.
/// Does nothing when <paramref name="list"/> is <c>null</c>.
/// </summary>
/// <param name="list">List whose elements are printed via <c>ToString()</c>.</param>
public void OutArrayList(ArrayList list)
{
    if (list != null)
    {
        foreach (object element in list)
        {
            Console.WriteLine(element.ToString());
        }
    }
}
- /// <summary>
- /// Segments one line of text (line breaks are not handled here). A "$" sentinel is appended, the text is walked character by character with a one-character look-ahead, and words found in the dictionary are emitted delimited by Separator. Updates EventTime.
- /// </summary>
- /// <param name="strText">Text to segment.</param>
- /// <returns>The segmented text. When the dictionary is not loaded, or the text plus sentinel is shorter than 3 characters, the trimmed input with the "$" sentinel still appended is returned.</returns>
- public string SegmentText(string strText)
- {
- strText = (strText + "$").Trim(); // append the "$" end sentinel so the loop can always look one character ahead
- if (htWords == null) return strText; // dictionary not loaded: nothing to segment with
- if (strText.Length < 3) return strText;
- DateTime start = DateTime.Now;
- int length = 0;
- int preFix = 0; // number of characters currently collected in strPrefix (candidate name)
- bool word = false; // the run being processed consists of letters
- bool number = false; // the run being processed consists of numbers
- string reText = "";
- string strPrefix = "";
- string strLastChar = "";
- string strLastWords = Separator;
- for (int i = 0; i < strText.Length - 1; i++) // stops one short of the end; strChar2 is the look-ahead character
- {
- #region 对于每一个字的处理过程
- string strChar1 = strText.Substring(i, 1);
- string strChar2 = strText.Substring(i + 1, 1).Trim();
- bool yes; // true once the current character has been written to reText (or deliberately consumed)
- SegList l;
- Hashtable h;
- if (reText.Length > 0) strLastChar = reText.Substring(reText.Length - 1);
- if (strChar1 == " ")
- {
- if ((number || word) && strLastChar != Separator) reText += this.Separator;
- yes = true;
- }
- else
- yes = false;
- int CharType = GetCharType(strChar1);
- switch (CharType)
- {
- case 1:
- #region 如果是数字,如果数字的上一位是字母要和后面的数字分开
- if (word)
- {
- reText += Separator;
- }
- word = false;
- number = true;
- strLastWords = "";
- break;
- #endregion
- case 2:
- case 5:
- #region 如果是字母
- if (number)
- strLastWords = Separator;
- else
- strLastWords = "";
- word = true;
- number = false;
- break;
- #endregion
- case 3:
- case 4:
- #region 第一级哈希表是否包含关键字,假如包含处理第二级哈希表
- //Was the previous character a letter? If so, close the letter run first.
- if (word) reText += Separator;
- #region 检测上一个是否是数字,这个过程是用于修正数字后的量词的
- if (number && CharType != 4)
- {
- h = (Hashtable)htWords["n"]; // NOTE(review): the "n" entry appears to hold the measure words that may follow a number; it is dereferenced without a null check - confirm the dictionary always defines it
- if (h.ContainsKey(strChar1))
- {
- l = (SegList)h[strChar1];
- if (l.Contains(strChar2))
- {
- reText += strChar1 + strChar2 + Separator;
- yes = true;
- i++;
- }
- else if (l.Contains("null"))
- {
- reText += strChar1 + Separator;
- yes = true;
- }
- }
- else
- reText += Separator;
- }
- #endregion
- //CharType 3: a dictionary character that is not a number character
- if (CharType == 3)
- {
- word = false;
- number = false;
- strLastWords = Separator;
- }
- else
- {
- word = false;
- number = true;
- strLastWords = "";
- }
- //Fetch the second-level hash table for the current character
- h = (Hashtable)htWords[strChar1];
- //Does the second-level hash table contain the look-ahead character?
- if (h.ContainsKey(strChar2))
- {
- #region 第二级包含关键字
- //Fetch the list of suffixes stored for this two-character head
- l = (SegList)h[strChar2];
- //Try each stored suffix in turn to see whether it completes a word
- for (int j = 0; j < l.Count; j++)
- {
- bool have = false;
- string strChar3 = l.GetElem(j).ToString();
- //Check each candidate suffix for a match; the length test guards against reading past the end of the text
- if ((strChar3.Length + i + 2) < strText.Length)
- {
- //Take as many characters after position i+2 as the candidate suffix is long
- string strChar = strText.Substring(i + 2, strChar3.Length).Trim();
- if (strChar3 == strChar && !yes)
- {
- if (strPrefix != "")
- {
- reText += strPrefix + Separator;
- strPrefix = "";
- preFix = 0;
- }
- reText += strChar1 + strChar2 + strChar;
- i += strChar3.Length + 1;
- have = true;
- yes = true;
- break;
- }
- }
- else if ((strChar3.Length + i + 2) == strText.Length)
- {
- string strChar = strText.Substring(i + 2).Trim();
- if (strChar3 == strChar && !yes)
- {
- if (strPrefix != "")
- {
- reText += strPrefix + Separator;
- strPrefix = "";
- preFix = 0;
- }
- reText += strChar1 + strChar2 + strChar;
- i += strChar3.Length + 1;
- have = true;
- yes = true;
- break;
- }
- }
- if (!have && j == l.Count - 1 && l.Contains("null") && !yes)
- {
- if (preFix == 1)
- {
- reText += strPrefix + strChar1 + strChar2;
- strPrefix = "";
- preFix = 0;
- }
- else if (preFix > 1)
- {
- reText += strPrefix + strLastWords + strChar1 + strChar2;
- strPrefix = "";
- preFix = 0;
- }
- else
- {
- if (CharType == 4) reText += strChar1 + strChar2; // both branches append the same text
- else reText += strChar1 + strChar2;
- strLastWords = this.Separator;
- number = false;
- }
- i++;
- yes = true;
- break;
- }
- else if (have)
- {
- break;
- }
- }
- #endregion
- //If nothing matched there is one more case: the word is exactly these two characters and no longer word starting with them was found
- if (!yes && l.Contains("null"))
- {
- if (preFix == 1)
- {
- reText += strPrefix + strChar1 + strChar2;
- strPrefix = "";
- preFix = 0;
- }
- else if (preFix > 1)
- {
- reText += strPrefix + strLastWords + strChar1 + strChar2;
- strPrefix = "";
- preFix = 0;
- }
- else
- {
- if (CharType == 4) reText += strChar1 + strChar2; // both branches append the same text
- else reText += strChar1 + strChar2;
- strLastWords = this.Separator;
- number = false;
- }
- i++;
- yes = true;
- }
- if (reText.Length > 0) strLastChar = reText.Substring(reText.Length - 1);
- if (CharType == 4 && GetCharType(strLastChar) == 4)
- {
- number = true;
- }
- else if (strLastChar != this.Separator) reText += this.Separator;
- }
- #endregion
- break;
- default:
- #region 未知字符,可能是生僻字,也可能是标点符合之类
- if (word && !yes)
- {
- reText += Separator;
- }
- else if (number && !yes)
- {
- reText += Separator;
- }
- number = false;
- word = false;
- strLastWords = this.Separator;
- break;
- #endregion
- }
- if (!yes && number || !yes && word)
- {
- reText += strChar1;
- yes = true;
- }
- if (!yes)
- {
- #region 处理姓名问题
- if (preFix == 0)
- {
- if (alPrefix.Contains(strChar1 + strChar2))
- {
- i++;
- strPrefix = strChar1 + strChar2;
- preFix++;
- }
- else if (alPrefix.Contains(strChar1))
- {
- if (!number)
- {
- strPrefix = strChar1;
- preFix++;
- }
- else
- {
- reText += strChar1 + strLastWords;
- number = false;
- word = false;
- }
- }
- else
- {
- if (preFix == 3) // NOTE(review): preFix is 0 inside this branch, so this test and the "preFix > 0" test below cannot succeed here
- {
- reText += strPrefix + Separator + strChar1 + Separator;
- strPrefix = "";
- preFix = 0;
- }
- else if (preFix > 0)
- {
- if (Regex.IsMatch(strChar1, strChinese))
- {
- strPrefix += strChar1;
- preFix++;
- }
- else
- {
- reText += strPrefix + Separator + strChar1 + Separator;
- strPrefix = "";
- preFix = 0;
- }
- }
- else
- {
- reText += strChar1 + strLastWords;
- number = false;
- word = false;
- }
- }
- }
- else
- {
- if (preFix == 3)
- {
- reText += strPrefix + Separator + strChar1 + Separator;
- strPrefix = "";
- preFix = 0;
- }
- else if (preFix > 0)
- {
- if (Regex.IsMatch(strChar1, strChinese))
- {
- strPrefix += strChar1;
- preFix++;
- }
- else
- {
- reText += strPrefix + Separator + strChar1 + Separator;
- strPrefix = "";
- preFix = 0;
- }
- }
- else
- {
- reText += strChar1 + strLastWords;
- number = false;
- }
- }
- #endregion
- }
- length = i;
- #endregion
- }
- #region 最后防止最后一个字的丢失
- if (length < strText.Length - 1)
- {
- string strLastChar1 = strText.Substring(strText.Length - 1).Trim();
- string strLastChar2 = strText.Substring(strText.Length - 2).Trim();
- if (reText.Length > 0) strLastChar = reText.Substring(reText.Length - 1);
- if (preFix != 0)
- {
- reText += strPrefix + strLastChar1;
- }
- else
- {
- switch (GetCharType(strLastChar1))
- {
- case 1:
- if (strLastChar1 != "." && strLastChar1 != ".") // NOTE(review): both operands are the same literal as written; the second was possibly meant to be a different character - confirm
- reText += strLastChar1;
- else
- reText += Separator + strLastChar1;
- break;
- case 2:
- case 5:
- if (alWord.Contains(strLastChar2))
- reText += strLastChar1;
- break;
- case 3:
- case 4:
- if ((number || word) && strLastChar != Separator)
- reText += Separator + strLastChar1;
- else
- reText += strLastChar1;
- break;
- default:
- if (strLastChar != Separator)
- reText += Separator + strLastChar1;
- else
- reText += strLastChar1;
- break;
- }
- }
- if (reText.Length > 0) strLastChar = (reText.Substring(reText.Length - 1));
- if (strLastChar != this.Separator) reText += this.Separator;
- }
- #endregion
- TimeSpan duration = DateTime.Now - start;
- m_EventTime = duration.TotalMilliseconds;
- return reText.Replace(" $", ""); //remove the "$" sentinel together with the space before it (the literal assumes a space, not the configured Separator)
- }
/// <summary>
/// Segments text, optionally treating it as multiple lines.
/// </summary>
/// <param name="strText">Text to segment. <c>null</c> is treated as empty when <paramref name="Enter"/> is true.</param>
/// <param name="Enter">
/// When true the text is split on '\n', each line is segmented separately and
/// every segmented line is terminated with "\r\n". When false the call is
/// forwarded unchanged to <c>SegmentText(string)</c>.
/// </param>
/// <returns>The segmented text.</returns>
public string SegmentText(string strText, bool Enter)
{
    if (!Enter)
    {
        return SegmentText(strText);
    }

    DateTime start = DateTime.Now;
    // Guard against null so multi-line mode does not throw on Split.
    string[] strArr = (strText ?? "").Split('\n');
    // StringBuilder avoids quadratic string concatenation on long inputs.
    System.Text.StringBuilder reText = new System.Text.StringBuilder();
    for (int i = 0; i < strArr.Length; i++)
    {
        // Drop the '\r' that Windows line endings leave behind after splitting on
        // '\n', so it is not segmented as a stray character before the "\r\n"
        // appended below.
        reText.Append(SegmentText(strArr[i].TrimEnd('\r')));
        reText.Append("\r\n");
    }
    // Report the time of the whole multi-line run, not just the last line.
    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
    return reText.ToString();
}
- #region 判断字符类型
/// <summary>
/// Classifies a single character against the loaded lists.
/// Base value: 0 = in neither list, 1 = in the number list, 2 = in the letter
/// list (the letter list wins when a character is in both). 3 is added when the
/// character is a first-level key of the dictionary, giving 3 = dictionary
/// character, 4 = number character that is also in the dictionary, and
/// 5 = letter that is also in the dictionary.
/// </summary>
/// <param name="p_Char">The character to classify, as a one-character string.</param>
/// <returns>A type code from 0 to 5 as described above.</returns>
private int GetCharType(string p_Char)
{
    bool isNumber = alNumber.Contains(p_Char);
    bool isLetter = alWord.Contains(p_Char);
    bool inDictionary = htWords.ContainsKey(p_Char);

    int typeCode = isLetter ? 2 : (isNumber ? 1 : 0);
    if (inDictionary)
    {
        typeCode = typeCode + 3;
    }
    return typeCode;
}
- #endregion
- #region 对加载的词典排序并重新写入
/// <summary>
/// Sorts the loaded dictionary and writes it back to the dictionary file,
/// without reloading afterwards. Same as <c>SortDic(false)</c>.
/// </summary>
public void SortDic()
{
    this.SortDic(false);
}
/// <summary>
/// Sorts each suffix list of the loaded dictionary and rewrites the dictionary
/// file at <c>DicPath</c> from the in-memory table. Entries stored as "null" are
/// written as the bare two-character word. Only the suffix lists are sorted; the
/// order of the two-character heads follows the hash-table enumeration order.
/// Updates <c>EventTime</c>.
/// </summary>
/// <param name="Reload">
/// When true, <c>InitWordDics</c> is called after writing. Note that
/// <c>InitWordDics</c> only re-reads the file when the "jcms_dict" cache entry is absent.
/// </param>
/// <exception cref="InvalidOperationException">The dictionary has not been loaded yet.</exception>
public void SortDic(bool Reload)
{
    // Check BEFORE opening the writer: opening truncates the dictionary file, so
    // failing afterwards on a null table would wipe the file.
    if (htWords == null)
    {
        throw new InvalidOperationException("The dictionary must be loaded (InitWordDics) before it can be sorted.");
    }

    DateTime start = DateTime.Now;
    // 'using' flushes and closes the file even if writing fails part-way.
    using (StreamWriter sw = new StreamWriter(DicPath, false, System.Text.Encoding.UTF8))
    {
        foreach (DictionaryEntry first in htWords)
        {
            foreach (DictionaryEntry second in (Hashtable)first.Value)
            {
                SegList aa = (SegList)second.Value;
                aa.Sort();
                string head = first.Key.ToString() + second.Key.ToString();
                for (int i = 0; i < aa.Count; i++)
                {
                    string tail = aa.GetElem(i).ToString();
                    // "null" marks a complete two-character word: write the head only.
                    if (tail == "null")
                        sw.WriteLine(head);
                    else
                        sw.WriteLine(head + tail);
                }
            }
        }
    }
    if (Reload) InitWordDics();
    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
}
- #endregion
/// <summary>
/// Removes exact duplicate lines from the dictionary file at <c>DicPath</c> and
/// rewrites it, keeping the first occurrence of each line in its original order.
/// Reading stops at end of file or at the first blank line, so anything after a
/// blank line is not written back. Writes "ready" to the console between the
/// read and write phases and updates <c>EventTime</c>.
/// </summary>
/// <returns>The number of duplicate lines that were dropped.</returns>
public int Optimize()
{
    int l = 0;
    DateTime start = DateTime.Now;
    Hashtable htOptimize = new Hashtable();
    // Remembers first-seen order; enumerating the Hashtable would scramble it.
    ArrayList uniqueLines = new ArrayList();

    using (StreamReader reader = new StreamReader(DicPath, System.Text.Encoding.UTF8))
    {
        string strline = reader.ReadLine();
        while (strline != null && strline.Trim() != "")
        {
            if (!htOptimize.ContainsKey(strline))
            {
                htOptimize.Add(strline, null);
                uniqueLines.Add(strline);
            }
            else
            {
                l++;
            }
            // Advance to the next line. This read was missing, so the loop
            // never terminated on a non-empty file.
            strline = reader.ReadLine();
        }
    }

    Console.WriteLine("ready");

    using (StreamWriter sw = new StreamWriter(DicPath, false, System.Text.Encoding.UTF8))
    {
        foreach (string line in uniqueLines)
        {
            sw.WriteLine(line);
        }
    }

    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
    return l;
}
- #endregion
- }
- }
|