Segment.cs 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Linq;
  4. using System.Text;
  5. using System.IO;
  6. using System.Collections;
  7. using System.Text.RegularExpressions;
  8. namespace Ant.Service.Utilities
  9. {
  10. /// <summary>
  11. /// 分词类改写自ShootSearch 中文分词组件
  12. /// </summary>
  13. //----------------调用----------------------
  14. //Segment seg = new Segment();
  15. //seg.InitWordDics();
  16. //seg.EnablePrefix = true;
  17. //seg.Separator =" ";
  18. //seg.SegmentText("字符串", false).Trim();
  19. //-------------------------------------------
  20. public class Segment
  21. {
  #region 私有字段
  // Default dictionary file locations. They are resolved through MapDefaultPath so that
  // constructing a Segment outside an HTTP request does not fail in the field initializers.
  private string m_DicPath = MapDefaultPath("ShootSeg/sDict.dic");
  private string m_NoisePath = MapDefaultPath("ShootSeg/sNoise.dic");
  private string m_NumberPath = MapDefaultPath("ShootSeg/sNumber.dic");
  private string m_WordPath = MapDefaultPath("ShootSeg/sWord.dic");
  private string m_PrefixPath = MapDefaultPath("ShootSeg/sPrefix.dic");

  // Two-level word table built by InitWordDics:
  // first character -> Hashtable(second character -> SegList of remaining suffixes).
  // A suffix entry of "null" marks a word that is exactly two characters long.
  private Hashtable htWords;
  // Lines of the noise dictionary (loaded by InitWordDics).
  private ArrayList alNoise;
  // Characters classified as digits by GetCharType.
  private ArrayList alNumber;
  // Characters classified as letters by GetCharType.
  private ArrayList alWord;
  // Name prefixes used by the name-handling step of SegmentText.
  private ArrayList alPrefix;
  // Duration of the last load/segment operation, in milliseconds.
  private double m_EventTime = 0;

  /// <summary>
  /// Separator inserted between segments.
  /// </summary>
  private string m_Separator = " ";

  /// <summary>
  /// Regular expression used to test whether a character is a Chinese character.
  /// </summary>
  private string strChinese = "[\u4e00-\u9fa5]";

  /// <summary>
  /// Resolves a default dictionary path. Inside an HTTP request this is
  /// Server.MapPath(relativePath), exactly as before; when there is no current
  /// HttpContext the path is resolved against the application base directory
  /// instead of throwing a NullReferenceException.
  /// </summary>
  /// <param name="relativePath">Dictionary path relative to the application.</param>
  /// <returns>The physical path of the dictionary file.</returns>
  private static string MapDefaultPath(string relativePath)
  {
      System.Web.HttpContext context = System.Web.HttpContext.Current;
      if (context != null)
      {
          return context.Server.MapPath(relativePath);
      }
      return Path.Combine(AppDomain.CurrentDomain.BaseDirectory, relativePath);
  }
  #endregion
  43. #region 公有属性
  /// <summary>
  /// Gets or sets the path of the main word dictionary file.
  /// </summary>
  public string DicPath
  {
      get { return m_DicPath; }
      set { m_DicPath = value; }
  }
  /// <summary>
  /// Stores a value in the ASP.NET application state of the current request.
  /// </summary>
  /// <param name="key">Key under which the value is stored.</param>
  /// <param name="val">Value to cache; a null value is replaced by the string " ".</param>
  private static void SetCache(string key, object val)
  {
      if (val == null)
      {
          val = " ";
      }
      System.Web.HttpApplicationState application = System.Web.HttpContext.Current.Application;
      application.Lock();
      try
      {
          application.Set(key, val);
      }
      finally
      {
          // Always release the application lock, even if Set throws.
          application.UnLock();
      }
  }
  /// <summary>
  /// Reads a value from the ASP.NET application state of the current request.
  /// </summary>
  /// <param name="key">Key of the cached value.</param>
  /// <returns>Whatever the application state returns for the key.</returns>
  private static object GetCache(string key)
  {
      object cached = System.Web.HttpContext.Current.Application.Get(key);
      return cached;
  }
  /// <summary>
  /// Gets or sets the path of the noise-word dictionary file.
  /// (The original author marked this as currently unused.)
  /// </summary>
  public string NoisePath
  {
      get { return m_NoisePath; }
      set { m_NoisePath = value; }
  }
  /// <summary>
  /// Gets or sets the path of the number (digit) dictionary file.
  /// </summary>
  public string NumberPath
  {
      get { return m_NumberPath; }
      set { m_NumberPath = value; }
  }
  /// <summary>
  /// Gets or sets the path of the letter dictionary file.
  /// </summary>
  public string WordPath
  {
      get { return m_WordPath; }
      set { m_WordPath = value; }
  }
  /// <summary>
  /// Gets or sets the path of the name-prefix dictionary file, used to correct
  /// the segmentation of personal names.
  /// </summary>
  public string PrefixPath
  {
      get { return m_PrefixPath; }
      set { m_PrefixPath = value; }
  }
  /// <summary>
  /// Gets or sets whether name correction is enabled.
  /// Reading returns true when at least one name prefix is loaded; it returns
  /// false (instead of throwing) when the prefix list has not been loaded yet.
  /// Setting true reloads the prefix list from PrefixPath; setting false
  /// replaces it with an empty list.
  /// </summary>
  public bool EnablePrefix
  {
      get
      {
          return alPrefix != null && alPrefix.Count > 0;
      }
      set
      {
          if (value)
          {
              alPrefix = LoadWords(PrefixPath, alPrefix);
          }
          else
          {
              alPrefix = new ArrayList();
          }
      }
  }
  /// <summary>
  /// Gets the duration, in milliseconds, of the most recent load or segmentation
  /// operation. For short input strings the measured value may be 0.
  /// </summary>
  public double EventTime
  {
      get { return m_EventTime; }
  }
  /// <summary>
  /// Gets or sets the separator placed between segments; the default is a single space.
  /// Assigning null or an empty string is ignored and keeps the current separator.
  /// </summary>
  public string Separator
  {
      get { return m_Separator; }
      set
      {
          if (string.IsNullOrEmpty(value))
          {
              return;
          }
          m_Separator = value;
      }
  }
  178. #endregion
  179. #region 构造方法
  /// <summary>
  /// Creates a segmenter that uses the default dictionary paths.
  /// InitWordDics must be called before segmenting.
  /// </summary>
  public Segment()
  {
  }

  /// <summary>
  /// Creates a segmenter with explicit dictionary paths and loads the dictionaries.
  /// The name-prefix dictionary keeps its default path.
  /// </summary>
  /// <param name="p_DicPath">Path of the main word dictionary.</param>
  /// <param name="p_NoisePath">Path of the noise-word dictionary.</param>
  /// <param name="p_NumberPath">Path of the number dictionary.</param>
  /// <param name="p_WordPath">Path of the letter dictionary.</param>
  public Segment(string p_DicPath, string p_NoisePath, string p_NumberPath, string p_WordPath)
  {
      // Each argument goes to its own field; previously all four were written to
      // m_WordPath, so only p_WordPath had any effect.
      m_DicPath = p_DicPath;
      m_NoisePath = p_NoisePath;
      m_NumberPath = p_NumberPath;
      m_WordPath = p_WordPath;
      this.InitWordDics();
  }
  196. #endregion
  197. #region 公有方法
  /// <summary>
  /// Loads all dictionaries.
  /// The main dictionary is parsed only when the application cache has no
  /// "jcms_dict" entry; otherwise the cached table is reused. The noise, number,
  /// letter and prefix lists are re-read from disk on every call.
  /// EventTime is set to the elapsed time in milliseconds.
  /// </summary>
  public void InitWordDics()
  {
      DateTime start = DateTime.Now;
      if (GetCache("jcms_dict") == null)
      {
          // Build into a local table so a failed read does not leave a half-filled htWords.
          Hashtable words = new Hashtable();
          using (StreamReader reader = new StreamReader(DicPath, System.Text.Encoding.UTF8))
          {
              string strline = reader.ReadLine();
              // Reading stops at end of file or at the first blank line.
              while (strline != null && strline.Trim() != "")
              {
                  // A dictionary word needs at least two characters for the two-level
                  // index; shorter lines are skipped instead of throwing in Substring.
                  if (strline.Length >= 2)
                  {
                      string strChar1 = strline.Substring(0, 1);
                      string strChar2 = strline.Substring(1, 1);
                      // "null" marks a word that consists of exactly these two characters.
                      string rest = strline.Length > 2 ? strline.Substring(2) : "null";

                      Hashtable father = words.ContainsKey(strChar1)
                          ? (Hashtable)words[strChar1]
                          : new Hashtable();
                      SegList list = father.ContainsKey(strChar2)
                          ? (SegList)father[strChar2]
                          : new SegList();
                      list.Add(rest);
                      // Store after mutating, as the original code did.
                      father[strChar2] = list;
                      words[strChar1] = father;
                  }
                  strline = reader.ReadLine();
              }
          }
          SetCache("jcms_dict", words);
      }
      htWords = (Hashtable)GetCache("jcms_dict");
      alNoise = LoadWords(NoisePath, alNoise);
      alNumber = LoadWords(NumberPath, alNumber);
      alWord = LoadWords(WordPath, alWord);
      alPrefix = LoadWords(PrefixPath, alPrefix);
      TimeSpan duration = DateTime.Now - start;
      m_EventTime = duration.TotalMilliseconds;
  }
  /// <summary>
  /// Reads a UTF-8 text file into a new ArrayList, one entry per line
  /// (blank lines are kept as empty strings).
  /// </summary>
  /// <param name="strPath">Path of the file to read.</param>
  /// <param name="list">Ignored; kept for compatibility. A new list is always created.</param>
  /// <returns>A new ArrayList holding the lines of the file in order.</returns>
  public ArrayList LoadWords(string strPath, ArrayList list)
  {
      list = new ArrayList();
      // The using block closes the reader even when a read fails.
      using (StreamReader reader = new StreamReader(strPath, System.Text.Encoding.UTF8))
      {
          string strline = reader.ReadLine();
          while (strline != null)
          {
              list.Add(strline);
              strline = reader.ReadLine();
          }
      }
      return list;
  }
  /// <summary>
  /// Writes every word of the loaded main dictionary to the console, one per line,
  /// as first character + second character + stored suffix.
  /// Does nothing when the dictionary has not been loaded.
  /// </summary>
  public void OutWords()
  {
      // Same guard style as OutArrayList: nothing loaded means nothing to print.
      if (htWords == null)
      {
          return;
      }
      foreach (DictionaryEntry first in htWords)
      {
          foreach (DictionaryEntry second in (Hashtable)first.Value)
          {
              SegList aa = (SegList)second.Value;
              for (int i = 0; i < aa.Count; i++)
              {
                  Console.WriteLine(first.Key.ToString() + second.Key.ToString() + aa.GetElem(i).ToString());
              }
          }
      }
  }
  /// <summary>
  /// Writes each element of the list to the console on its own line.
  /// A null list is ignored.
  /// </summary>
  /// <param name="list">List to print; may be null.</param>
  public void OutArrayList(ArrayList list)
  {
      if (list != null)
      {
          foreach (object item in list)
          {
              Console.WriteLine(item.ToString());
          }
      }
  }
  /// <summary>
  /// Segments one line of text (line breaks are not handled here; use the
  /// two-argument overload for multi-line text).
  /// The text is scanned left to right with a one-character look-ahead. Characters
  /// are classified by GetCharType; dictionary characters are matched against the
  /// two-level word table, and characters that match nothing go through the
  /// name-prefix handling. EventTime is set to the elapsed time in milliseconds.
  /// </summary>
  /// <param name="strText">Text to segment; null is treated as empty.</param>
  /// <returns>
  /// The segments joined by Separator. When the dictionary is not loaded, or the
  /// text has fewer than two characters, the left-trimmed input is returned unchanged.
  /// </returns>
  public string SegmentText(string strText)
  {
      // A "$" sentinel is appended so the loop can always read one character ahead.
      strText = (strText + "$").Trim();
      // Nothing to segment: hand back the input without the internal sentinel
      // (previously the sentinel leaked into the returned string).
      if (htWords == null || strText.Length < 3)
      {
          return strText.Substring(0, strText.Length - 1);
      }
      DateTime start = DateTime.Now;
      int length = 0;                   // index reached by the last loop iteration
      int preFix = 0;                   // number of steps buffered in strPrefix as a candidate name
      bool word = false;                // previous character was a letter
      bool number = false;              // previous character was a digit / numeral
      string reText = "";               // output built so far
      string strPrefix = "";            // buffered candidate name
      string strLastChar = "";          // last character currently in reText
      string strLastWords = Separator;  // what to append after a stand-alone character
      for (int i = 0; i < strText.Length - 1; i++)
      {
          string strChar1 = strText.Substring(i, 1);
          string strChar2 = strText.Substring(i + 1, 1).Trim();
          bool yes;                     // true once the current character has been emitted
          SegList l;
          Hashtable h;
          if (reText.Length > 0)
          {
              strLastChar = reText.Substring(reText.Length - 1);
          }
          if (strChar1 == " ")
          {
              // A space ends a running number/letter sequence.
              if ((number || word) && strLastChar != Separator)
              {
                  reText += this.Separator;
              }
              yes = true;
          }
          else
          {
              yes = false;
          }
          int CharType = GetCharType(strChar1);
          switch (CharType)
          {
              case 1:
                  // Digit: split it from a preceding letter sequence.
                  if (word)
                  {
                      reText += Separator;
                  }
                  word = false;
                  number = true;
                  strLastWords = "";
                  break;
              case 2:
              case 5:
                  // Letter.
                  strLastWords = number ? Separator : "";
                  word = true;
                  number = false;
                  break;
              case 3:
              case 4:
                  // Character that is a first-level key of the word table.
                  if (word)
                  {
                      reText += Separator;
                  }
                  // After a number, try the "n" table (measure words following numbers).
                  if (number && CharType != 4)
                  {
                      h = (Hashtable)htWords["n"];
                      // The dictionary may have no "n" entries at all; treat that like
                      // "no measure word" instead of dereferencing null.
                      if (h != null && h.ContainsKey(strChar1))
                      {
                          l = (SegList)h[strChar1];
                          if (l.Contains(strChar2))
                          {
                              reText += strChar1 + strChar2 + Separator;
                              yes = true;
                              i++;
                          }
                          else if (l.Contains("null"))
                          {
                              reText += strChar1 + Separator;
                              yes = true;
                          }
                      }
                      else
                      {
                          reText += Separator;
                      }
                  }
                  if (CharType == 3)
                  {
                      word = false;
                      number = false;
                      strLastWords = Separator;
                  }
                  else
                  {
                      // Type 4: handled as part of a number sequence.
                      word = false;
                      number = true;
                      strLastWords = "";
                  }
                  // Second-level table for this first character.
                  h = (Hashtable)htWords[strChar1];
                  if (h.ContainsKey(strChar2))
                  {
                      l = (SegList)h[strChar2];
                      // Look for the first stored suffix that matches the text after the two characters.
                      for (int j = 0; j < l.Count; j++)
                      {
                          string strChar3 = l.GetElem(j).ToString();
                          // Length guard: the suffix must fit inside the remaining text.
                          if ((strChar3.Length + i + 2) <= strText.Length)
                          {
                              string strChar = strText.Substring(i + 2, strChar3.Length).Trim();
                              if (strChar3 == strChar && !yes)
                              {
                                  // A buffered name candidate is flushed before the word.
                                  if (strPrefix != "")
                                  {
                                      reText += strPrefix + Separator;
                                      strPrefix = "";
                                      preFix = 0;
                                  }
                                  reText += strChar1 + strChar2 + strChar;
                                  i += strChar3.Length + 1;
                                  yes = true;
                                  break;
                              }
                          }
                      }
                      // No longer word matched, but the two characters form a word on their own.
                      // (The original repeated this fallback inside the loop on its last
                      // iteration and again here; one copy after the loop is equivalent.)
                      if (!yes && l.Contains("null"))
                      {
                          if (preFix == 1)
                          {
                              reText += strPrefix + strChar1 + strChar2;
                              strPrefix = "";
                              preFix = 0;
                          }
                          else if (preFix > 1)
                          {
                              reText += strPrefix + strLastWords + strChar1 + strChar2;
                              strPrefix = "";
                              preFix = 0;
                          }
                          else
                          {
                              reText += strChar1 + strChar2;
                              strLastWords = this.Separator;
                              number = false;
                          }
                          i++;
                          yes = true;
                      }
                      if (reText.Length > 0)
                      {
                          strLastChar = reText.Substring(reText.Length - 1);
                      }
                      if (CharType == 4 && GetCharType(strLastChar) == 4)
                      {
                          // Keep consecutive type-4 characters together as one number.
                          number = true;
                      }
                      else if (strLastChar != this.Separator)
                      {
                          reText += this.Separator;
                      }
                  }
                  break;
              default:
                  // Unknown character: possibly a rare character or punctuation.
                  if ((word || number) && !yes)
                  {
                      reText += Separator;
                  }
                  number = false;
                  word = false;
                  strLastWords = this.Separator;
                  break;
          }
          // Inside a number or letter sequence the character is appended as is.
          if (!yes && (number || word))
          {
              reText += strChar1;
              yes = true;
          }
          if (!yes)
          {
              // Name handling for characters that matched nothing above.
              if (preFix == 0)
              {
                  if (alPrefix.Contains(strChar1 + strChar2))
                  {
                      // Two-character name prefix.
                      i++;
                      strPrefix = strChar1 + strChar2;
                      preFix++;
                  }
                  else if (alPrefix.Contains(strChar1))
                  {
                      if (!number)
                      {
                          strPrefix = strChar1;
                          preFix++;
                      }
                      else
                      {
                          reText += strChar1 + strLastWords;
                          number = false;
                          word = false;
                      }
                  }
                  else
                  {
                      // Not a name prefix: emit the character on its own.
                      // (The original also tested preFix == 3 and preFix > 0 here, which
                      // cannot be true inside the preFix == 0 branch.)
                      reText += strChar1 + strLastWords;
                      number = false;
                      word = false;
                  }
              }
              else
              {
                  if (preFix == 3)
                  {
                      // Candidate name is full: flush it and emit this character separately.
                      reText += strPrefix + Separator + strChar1 + Separator;
                      strPrefix = "";
                      preFix = 0;
                  }
                  else if (preFix > 0)
                  {
                      if (Regex.IsMatch(strChar1, strChinese))
                      {
                          // Extend the candidate name with another Chinese character.
                          strPrefix += strChar1;
                          preFix++;
                      }
                      else
                      {
                          reText += strPrefix + Separator + strChar1 + Separator;
                          strPrefix = "";
                          preFix = 0;
                      }
                  }
                  else
                  {
                      reText += strChar1 + strLastWords;
                      number = false;
                  }
              }
          }
          length = i;
      }
      // Make sure the final character (the sentinel position) is not lost.
      if (length < strText.Length - 1)
      {
          string strLastChar1 = strText.Substring(strText.Length - 1).Trim();
          string strLastChar2 = strText.Substring(strText.Length - 2).Trim();
          if (reText.Length > 0)
          {
              strLastChar = reText.Substring(reText.Length - 1);
          }
          if (preFix != 0)
          {
              reText += strPrefix + strLastChar1;
          }
          else
          {
              switch (GetCharType(strLastChar1))
              {
                  case 1:
                      // NOTE(review): the original compared against "." twice; the second
                      // literal may once have been a full-width period - confirm.
                      if (strLastChar1 != ".")
                      {
                          reText += strLastChar1;
                      }
                      else
                      {
                          reText += Separator + strLastChar1;
                      }
                      break;
                  case 2:
                  case 5:
                      if (alWord.Contains(strLastChar2))
                      {
                          reText += strLastChar1;
                      }
                      break;
                  case 3:
                  case 4:
                      if ((number || word) && strLastChar != Separator)
                      {
                          reText += Separator + strLastChar1;
                      }
                      else
                      {
                          reText += strLastChar1;
                      }
                      break;
                  default:
                      if (strLastChar != Separator)
                      {
                          reText += Separator + strLastChar1;
                      }
                      else
                      {
                          reText += strLastChar1;
                      }
                      break;
              }
          }
          if (reText.Length > 0)
          {
              strLastChar = reText.Substring(reText.Length - 1);
          }
          if (strLastChar != this.Separator)
          {
              reText += this.Separator;
          }
      }
      TimeSpan duration = DateTime.Now - start;
      m_EventTime = duration.TotalMilliseconds;
      // Strip the sentinel segment. The configured separator is used instead of a
      // hard-coded space, so the "$" no longer leaks when Separator is not " ".
      return reText.Replace(Separator + "$", "");
  }
  /// <summary>
  /// Segments text that may span several lines.
  /// </summary>
  /// <param name="strText">Text to segment.</param>
  /// <param name="Enter">
  /// When true, the text is split on '\n', every line is segmented on its own and
  /// the results are joined with "\r\n" after each line. When false, the text is
  /// passed to the single-line overload as is.
  /// </param>
  /// <returns>The segmented text.</returns>
  public string SegmentText(string strText, bool Enter)
  {
      if (!Enter)
      {
          return SegmentText(strText);
      }

      DateTime begin = DateTime.Now;
      StringBuilder joined = new StringBuilder();
      foreach (string line in strText.Split('\n'))
      {
          joined.Append(SegmentText(line)).Append("\r\n");
      }
      // Report the time of the whole multi-line run, not just of the last line.
      m_EventTime = (DateTime.Now - begin).TotalMilliseconds;
      return joined.ToString();
  }
  719. #region 判断字符类型
  /// <summary>
  /// Classifies a single character.
  /// Base value: 0 = unknown, 1 = listed in the number list, 2 = listed in the
  /// letter list (the letter list wins when a character is in both).
  /// 3 is added when the character is a first-level key of the word table, so the
  /// possible results are 0, 1, 2, 3 (dictionary character), 4 (number that is
  /// also a dictionary character, e.g. a Chinese numeral) and 5 (letter that is
  /// also a dictionary character).
  /// </summary>
  /// <param name="p_Char">Character to classify, as a one-character string.</param>
  /// <returns>The character type code described above.</returns>
  private int GetCharType(string p_Char)
  {
      bool isNumber = alNumber.Contains(p_Char);
      bool isLetter = alWord.Contains(p_Char);
      bool inDictionary = htWords.ContainsKey(p_Char);

      int baseType = isLetter ? 2 : (isNumber ? 1 : 0);
      return inDictionary ? baseType + 3 : baseType;
  }
  731. #endregion
  732. #region 对加载的词典排序并重新写入
  /// <summary>
  /// Sorts the suffix lists of the loaded dictionary and rewrites the dictionary
  /// file, without reloading afterwards.
  /// </summary>
  public void SortDic()
  {
      SortDic(false);
  }

  /// <summary>
  /// Sorts every suffix list of the loaded dictionary and rewrites the file at
  /// DicPath. Entries are written in the enumeration order of the hash tables;
  /// only the suffixes that share the same first two characters are sorted.
  /// When no dictionary is loaded the file is left untouched.
  /// EventTime is set to the elapsed time in milliseconds.
  /// </summary>
  /// <param name="Reload">Whether to call InitWordDics after writing.</param>
  public void SortDic(bool Reload)
  {
      DateTime start = DateTime.Now;
      // Opening the writer truncates the file, so never do that without data to write.
      if (htWords != null)
      {
          using (StreamWriter sw = new StreamWriter(DicPath, false, System.Text.Encoding.UTF8))
          {
              foreach (DictionaryEntry first in htWords)
              {
                  foreach (DictionaryEntry second in (Hashtable)first.Value)
                  {
                      SegList aa = (SegList)second.Value;
                      aa.Sort();
                      string head = first.Key.ToString() + second.Key.ToString();
                      for (int i = 0; i < aa.Count; i++)
                      {
                          string rest = aa.GetElem(i).ToString();
                          // "null" marks a two-character word: write the two characters only.
                          if (rest == "null")
                          {
                              sw.WriteLine(head);
                          }
                          else
                          {
                              sw.WriteLine(head + rest);
                          }
                      }
                  }
              }
          }
      }
      if (Reload)
      {
          InitWordDics();
      }
      TimeSpan duration = DateTime.Now - start;
      m_EventTime = duration.TotalMilliseconds;
  }
  770. #endregion
  /// <summary>
  /// Removes duplicate lines from the dictionary file at DicPath and rewrites it.
  /// (The original author marked this as currently unused.)
  /// Only the part before the first blank line is de-duplicated, which is the part
  /// InitWordDics reads; the first occurrence of each line is kept, in file order.
  /// Anything from the first blank line onwards is written back unchanged.
  /// EventTime is set to the elapsed time in milliseconds.
  /// </summary>
  /// <returns>The number of duplicate lines that were removed.</returns>
  public int Optimize()
  {
      int l = 0;
      DateTime start = DateTime.Now;
      HashSet<string> seen = new HashSet<string>();
      List<string> output = new List<string>();
      using (StreamReader reader = new StreamReader(DicPath, System.Text.Encoding.UTF8))
      {
          bool pastDictionary = false;
          string strline;
          // Read a new line on every iteration; the original never advanced the
          // reader and therefore looped forever on a non-empty file.
          while ((strline = reader.ReadLine()) != null)
          {
              if (!pastDictionary && strline.Trim() == "")
              {
                  pastDictionary = true;
              }
              if (pastDictionary)
              {
                  // Outside the dictionary section: keep verbatim.
                  output.Add(strline);
              }
              else if (seen.Add(strline))
              {
                  output.Add(strline);
              }
              else
              {
                  l++;
              }
          }
      }
      Console.WriteLine("ready");
      using (StreamWriter sw = new StreamWriter(DicPath, false, System.Text.Encoding.UTF8))
      {
          foreach (string line in output)
          {
              sw.WriteLine(line);
          }
      }
      TimeSpan duration = DateTime.Now - start;
      m_EventTime = duration.TotalMilliseconds;
      return l;
  }
  808. #endregion
  809. }
  810. }