HTMLHelper.cs 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. using System;
  2. using System.Text;
  3. using System.Net;
  4. using System.IO;
  5. using System.Threading;
  6. using System.Text.RegularExpressions;
  7. namespace Ant.Service.Utilities
  8. {
  9. public class HTMLHelper
  10. {
  #region Private fields
  // Cookie container returned by the CookieContainer property.
  private static CookieContainer cc = new CookieContainer();
  // Content-Type header value used for outgoing requests.
  private static string contentType = "application/x-www-form-urlencoded";
  // Accept header value used for outgoing requests.
  private static string accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/x-silverlight, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/x-silverlight-2-b1, */*";
  // User-Agent header value used for outgoing requests.
  private static string userAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";
  // Text encoding exposed through the Encoding property.
  private static Encoding encoding = Encoding.GetEncoding("utf-8");
  // Lower bound of the randomized delay returned by NetworkDelay.
  private static int delay = 1000;
  // Retry limit exposed through MaxTry.
  private static int maxTry = 300;
  // Legacy attempt counter. It is static and unsynchronized, so code that
  // relies on it is not safe for concurrent calls.
  private static int currentTry = 0;
  // One shared generator: creating a Random per read can return the same value
  // repeatedly on runtimes that seed from the system clock.
  private static readonly Random delayRandom = new Random();
  // Random is not thread-safe, so access to delayRandom is serialized.
  private static readonly object delayRandomLock = new object();
  #endregion
  #region Public properties
  /// <summary>
  /// Gets the shared cookie container.
  /// </summary>
  public static CookieContainer CookieContainer
  {
      get
      {
          return cc;
      }
  }
  /// <summary>
  /// Gets or sets the text encoding used to decode response bodies.
  /// </summary>
  public static Encoding Encoding
  {
      get
      {
          return encoding;
      }
      set
      {
          encoding = value;
      }
  }
  /// <summary>
  /// Gets a randomized delay in the range [delay, 2 * delay), or sets the base delay.
  /// </summary>
  /// <remarks>
  /// A base delay of zero or less yields 0 instead of throwing, and the upper
  /// bound is clamped so that doubling a very large base delay cannot overflow.
  /// </remarks>
  public static int NetworkDelay
  {
      get
      {
          // Read the field once so both bounds come from the same value.
          int baseDelay = delay;
          if (baseDelay <= 0)
          {
              return 0;
          }
          int upperBound = baseDelay > int.MaxValue / 2 ? int.MaxValue : baseDelay * 2;
          lock (delayRandomLock)
          {
              return delayRandom.Next(baseDelay, upperBound);
          }
      }
      set
      {
          delay = value;
      }
  }
  /// <summary>
  /// Gets or sets the retry limit.
  /// </summary>
  public static int MaxTry
  {
      get
      {
          return maxTry;
      }
      set
      {
          maxTry = value;
      }
  }
  #endregion
  #region Fetch HTML
  /// <summary>
  /// Fetches a page as text, sending <paramref name="postData"/> as a POST body when requested.
  /// </summary>
  /// <param name="url">Address to request; it is also sent as the Referer header.</param>
  /// <param name="postData">Request body. When null or empty, a plain GET is issued.</param>
  /// <param name="isPost">
  /// True to POST <paramref name="postData"/>. When false a GET is issued and
  /// <paramref name="postData"/> is not sent, because a GET request cannot carry a body.
  /// </param>
  /// <param name="cookieContainer">Cookie container attached to the request.</param>
  /// <returns>The response body, or <c>string.Empty</c> when the request did not succeed.</returns>
  public static string GetHtml(string url, string postData, bool isPost, CookieContainer cookieContainer)
  {
      if (string.IsNullOrEmpty(postData) || !isPost)
      {
          return GetHtml(url, cookieContainer);
      }
      // NOTE(review): the body is encoded with the system default code page while the
      // response is decoded with the configured encoding; kept as-is for compatibility.
      byte[] body = Encoding.Default.GetBytes(postData);
      return DownloadWithRetry(url, body, cookieContainer);
  }
  /// <summary>
  /// Fetches a page as text with a GET request.
  /// </summary>
  /// <param name="url">Address to request; it is also sent as the Referer header.</param>
  /// <param name="cookieContainer">Cookie container attached to the request.</param>
  /// <returns>The response body, or <c>string.Empty</c> when the request did not succeed.</returns>
  public static string GetHtml(string url, CookieContainer cookieContainer)
  {
      return DownloadWithRetry(url, null, cookieContainer);
  }
  /// <summary>
  /// Runs the request, retrying network failures up to maxTry additional times.
  /// </summary>
  private static string DownloadWithRetry(string url, byte[] body, CookieContainer cookieContainer)
  {
      // A local counter keeps concurrent calls from corrupting each other's retry budget.
      int retriesLeft = maxTry;
      while (true)
      {
          // Throttle every attempt, including the first one.
          Thread.Sleep(NetworkDelay);
          try
          {
              return Download(url, body, cookieContainer);
          }
          catch (Exception ex)
          {
              // Only network-level failures are worth retrying; anything else
              // (malformed URL, invalid arguments) would fail the same way again.
              bool transient = ex is WebException || ex is IOException;
              if (!transient || retriesLeft-- <= 0)
              {
                  return string.Empty;
              }
          }
      }
  }
  /// <summary>
  /// Performs a single request and returns the decoded response body.
  /// </summary>
  private static string Download(string url, byte[] body, CookieContainer cookieContainer)
  {
      HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
      request.CookieContainer = cookieContainer;
      request.ContentType = contentType;
      // ConnectionLimit rejects non-positive values, so only apply a usable limit.
      if (maxTry > 0)
      {
          request.ServicePoint.ConnectionLimit = maxTry;
      }
      request.Referer = url;
      request.Accept = accept;
      request.UserAgent = userAgent;
      if (body == null)
      {
          request.Method = "GET";
      }
      else
      {
          request.Method = "POST";
          request.ContentLength = body.Length;
          using (Stream requestStream = request.GetRequestStream())
          {
              requestStream.Write(body, 0, body.Length);
          }
      }
      // The using blocks release the connection even when reading fails.
      using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
      using (Stream responseStream = response.GetResponseStream())
      using (StreamReader reader = new StreamReader(responseStream, encoding))
      {
          return reader.ReadToEnd();
      }
  }
  #endregion
  #region Fetch response stream
  //---------------------------------------------------------------------------------------------------------------
  // Example:
  // System.Net.CookieContainer cookie = new System.Net.CookieContainer();
  // Stream s = HTMLHelper.GetStream("http://ptlogin2.qq.com/getimage?aid=15000102&0.43878429697395826", cookie);
  // picVerify.Image = Image.FromStream(s);
  //---------------------------------------------------------------------------------------------------------------
  /// <summary>
  /// Issues a GET request and returns the raw response stream.
  /// </summary>
  /// <param name="url">Address to request; it is also sent as the Referer header.</param>
  /// <param name="cookieContainer">Cookie container attached to the request.</param>
  /// <returns>
  /// The open response stream, which the caller must close, or <c>null</c> when the
  /// request did not succeed.
  /// </returns>
  public static Stream GetStream(string url, CookieContainer cookieContainer)
  {
      // A local counter keeps concurrent calls from sharing one retry budget.
      int retriesLeft = maxTry;
      while (true)
      {
          HttpWebResponse response = null;
          try
          {
              HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
              request.CookieContainer = cookieContainer;
              request.ContentType = contentType;
              // ConnectionLimit rejects non-positive values, so only apply a usable limit.
              if (maxTry > 0)
              {
                  request.ServicePoint.ConnectionLimit = maxTry;
              }
              request.Referer = url;
              request.Accept = accept;
              request.UserAgent = userAgent;
              request.Method = "GET";
              response = (HttpWebResponse)request.GetResponse();
              // The response stays open on purpose: the returned stream reads from it.
              return response.GetResponseStream();
          }
          catch (Exception ex)
          {
              if (response != null)
              {
                  response.Close();
              }
              // Retry the stream request itself, and only for network-level failures.
              bool transient = ex is WebException || ex is IOException;
              if (!transient || retriesLeft-- <= 0)
              {
                  return null;
              }
          }
          // Back off before the next attempt.
          Thread.Sleep(NetworkDelay);
      }
  }
  #endregion
  #region Strip HTML markup
  /// <summary>
  /// Removes script blocks and HTML tags from a string and decodes a few common entities.
  /// </summary>
  /// <param name="Htmlstring">Source text that may contain HTML.</param>
  /// <returns>
  /// The text with markup removed. Angle brackets left over after stripping and
  /// decoding are removed as well. Null or empty input yields an empty string.
  /// </returns>
  public static string NoHTML(string Htmlstring)
  {
      if (string.IsNullOrEmpty(Htmlstring))
      {
          return string.Empty;
      }
      // Remove script blocks; Singleline lets '.' cross line breaks so multi-line scripts go too.
      Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
      // Remove tags.
      Htmlstring = Regex.Replace(Htmlstring, "<.+?>", "", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
      // Remove line breaks together with the whitespace that follows them.
      Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
      // Remove comment remnants.
      Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);
      // Decode common entities.
      Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
      // Drop any other numeric entity.
      Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);
      // Strings are immutable: the results must be assigned or these calls do nothing.
      Htmlstring = Htmlstring.Replace("<", "");
      Htmlstring = Htmlstring.Replace(">", "");
      Htmlstring = Htmlstring.Replace("\r\n", "");
      return Htmlstring;
  }
  #endregion
  #region Match page links
  /// <summary>
  /// Collects the href attribute targets found in a piece of HTML.
  /// </summary>
  /// <param name="HtmlCode">HTML text to scan.</param>
  /// <returns>
  /// The lowercased text following each <c>href=</c>, up to the next whitespace,
  /// with a '|' appended after every entry. Returns an empty string when nothing
  /// matches or the input is null or empty.
  /// </returns>
  public string GetHref(string HtmlCode)
  {
      if (string.IsNullOrEmpty(HtmlCode))
      {
          return "";
      }
      string Reg = @"(h|H)(r|R)(e|E)(f|F) *= *('|"")?((\w|\\|\/|\.|:|-|_)+)[\S]*";
      StringBuilder links = new StringBuilder();
      foreach (Match m in Regex.Matches(HtmlCode, Reg))
      {
          string lowered = m.Value.ToLowerInvariant();
          // Strip only the leading attribute name. The pattern allows spaces around
          // '=', and an "href=" that appears inside the URL itself must be kept.
          string target = Regex.Replace(lowered, @"^href *= *", "").Trim();
          links.Append(target).Append('|');
      }
      return links.ToString();
  }
  #endregion
  260. #region 匹配页面的图片地址
  /// <summary>
  /// Collects the image addresses of every <c>img</c> tag found in a piece of HTML.
  /// </summary>
  /// <param name="HtmlCode">HTML text to scan; it is lowercased before matching.</param>
  /// <param name="imgHttp">The http:// prefix passed on to <c>GetImg</c> for each tag.</param>
  /// <returns>
  /// The <c>GetImg</c> result for each tag, with a '|' appended after every entry.
  /// Returns an empty string when the input is null or empty or has no img tag.
  /// </returns>
  public string GetImgSrc(string HtmlCode, string imgHttp)
  {
      if (string.IsNullOrEmpty(HtmlCode))
      {
          return "";
      }
      StringBuilder sources = new StringBuilder();
      // Singleline lets a tag that is wrapped across several lines match as well.
      foreach (Match m in Regex.Matches(HtmlCode.ToLowerInvariant(), @"<img.+?>", RegexOptions.Singleline))
      {
          sources.Append(GetImg(m.Value.Trim(), imgHttp)).Append('|');
      }
      return sources.ToString();
  }
  /// <summary>
  /// Extracts the image address from the src attribute of a single img tag.
  /// </summary>
  /// <param name="ImgString">Text of one img tag; it is lowercased before matching.</param>
  /// <param name="imgHttp">Prefix prepended when the address does not look absolute.</param>
  /// <returns>
  /// The src value without surrounding quotes. It is returned unchanged when it
  /// contains a scheme or one of the known domain suffixes, and prefixed with
  /// <paramref name="imgHttp"/> otherwise. When no src attribute is found the result
  /// is <paramref name="imgHttp"/> alone.
  /// </returns>
  public string GetImg(string ImgString, string imgHttp)
  {
      string source = "";
      if (!string.IsNullOrEmpty(ImgString))
      {
          // Capture only the attribute value: double-quoted, single-quoted or bare.
          // The lookbehind keeps attributes such as data-src from matching.
          Match m = Regex.Match(
              ImgString.ToLowerInvariant(),
              @"(?<![\w-])src\s*=\s*(?:""([^""]*)""|'([^']*)'|([^\s""'>]+?)(?=\s|/?>|$))");
          if (m.Success)
          {
              // Exactly one of the three groups took part in the match; the others are empty.
              source = (m.Groups[1].Value + m.Groups[2].Value + m.Groups[3].Value).Trim();
          }
      }
      bool absolute = source.IndexOf("://", StringComparison.Ordinal) != -1
          || source.StartsWith("//", StringComparison.Ordinal);
      if (!absolute)
      {
          // Domain-suffix heuristic kept for addresses written without a scheme.
          string[] domainSuffixes = { ".net", ".com", ".org", ".cn", ".cc", ".info", ".biz", ".tv" };
          foreach (string suffix in domainSuffixes)
          {
              if (source.IndexOf(suffix, StringComparison.Ordinal) != -1)
              {
                  absolute = true;
                  break;
              }
          }
      }
      if (absolute)
      {
          return source;
      }
      return imgHttp + source;
  }
  292. #endregion
  293. #region 抓取远程页面内容
  /// <summary>
  /// Fetches a remote page with a GET request.
  /// </summary>
  /// <param name="tUrl">Address to request.</param>
  /// <returns>
  /// The response text with every line terminated by "\r\n", decoded with the system
  /// default encoding. When the request fails, the exception message is returned instead.
  /// </returns>
  public static string Get_Http(string tUrl)
  {
      try
      {
          HttpWebRequest request = (HttpWebRequest)WebRequest.Create(tUrl);
          request.Timeout = 19600;
          // The using blocks release the connection even when reading fails.
          using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
          using (Stream responseStream = response.GetResponseStream())
          using (StreamReader reader = new StreamReader(responseStream, Encoding.Default))
          {
              StringBuilder content = new StringBuilder();
              string line;
              // ReadLine returns null only at the real end of the stream. Peek can
              // report -1 while more network data is still on its way, which would
              // truncate the page.
              while ((line = reader.ReadLine()) != null)
              {
                  content.Append(line).Append("\r\n");
              }
              return content.ToString();
          }
      }
      catch (Exception ee)
      {
          return ee.Message;
      }
  }
  /// <summary>
  /// Fetches a remote page with a POST request.
  /// </summary>
  /// <param name="url">Address to request.</param>
  /// <param name="postData">Form-encoded parameter list; null is sent as an empty body.</param>
  /// <param name="encodeType">Name of the encoding used to encode <paramref name="postData"/>.</param>
  /// <returns>
  /// The response text decoded with the system default encoding. When the request
  /// fails, the exception message is returned instead.
  /// </returns>
  public static string Post_Http(string url, string postData, string encodeType)
  {
      try
      {
          Encoding requestEncoding = Encoding.GetEncoding(encodeType);
          byte[] body = requestEncoding.GetBytes(postData ?? string.Empty);
          HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
          request.Method = "POST";
          request.ContentType = "application/x-www-form-urlencoded";
          request.ContentLength = body.Length;
          using (Stream requestStream = request.GetRequestStream())
          {
              // Write the POST body.
              requestStream.Write(body, 0, body.Length);
          }
          // The using blocks release the connection even when reading fails.
          // NOTE(review): the response is decoded with the system default encoding,
          // not encodeType; kept as-is because callers may rely on it.
          using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
          using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.Default))
          {
              return reader.ReadToEnd();
          }
      }
      catch (Exception ex)
      {
          return ex.Message;
      }
  }
  349. #endregion
  #region Compress HTML output
  /// <summary>
  /// Compacts HTML by removing whitespace between tags and dropping line breaks
  /// together with the whitespace that follows them.
  /// </summary>
  /// <param name="Html">HTML text to compact.</param>
  /// <returns>The compacted HTML.</returns>
  public static string ZipHtml(string Html)
  {
      // Whitespace-only runs between one tag's '>' and the next tag's '<'.
      Regex betweenTags = new Regex(@">\s+?<");
      string compact = betweenTags.Replace(Html, "><");
      // Line breaks and any indentation after them.
      Regex lineBreaks = new Regex(@"\r\n\s*");
      compact = lineBreaks.Replace(compact, "");
      // Rewrites the body element's opening and closing tags in lowercase.
      Regex bodyElement = new Regex(@"<body([\s|\S]*?)>([\s|\S]*?)</body>", RegexOptions.IgnoreCase);
      return bodyElement.Replace(compact, @"<body$1>$2</body>");
  }
  #endregion
  #region Filter a given HTML tag
  /// <summary>
  /// Removes the opening and closing tags of one element, keeping the content between them.
  /// </summary>
  /// <param name="s_TextStr">Text to filter.</param>
  /// <param name="html_Str">
  /// Tag name to remove, for example a, img, p or div. The value is inserted into a
  /// regular expression as-is, so an alternation such as <c>a|p</c> is possible.
  /// </param>
  /// <returns>
  /// The filtered text; an empty string when <paramref name="s_TextStr"/> is null or
  /// empty; the text unchanged when <paramref name="html_Str"/> is null or empty.
  /// </returns>
  public static string DelHtml(string s_TextStr, string html_Str)
  {
      if (string.IsNullOrEmpty(s_TextStr))
      {
          return "";
      }
      if (string.IsNullOrEmpty(html_Str))
      {
          // Without a tag name the pattern would match every tag.
          return s_TextStr;
      }
      // The lookahead requires the tag name to end right there, so "p" no longer
      // matches <pre> and "a" no longer matches <abbr>.
      string openingTag = "<(?:" + html_Str + @")(?=[\s/>])[^>]*>";
      string closingTag = "</(?:" + html_Str + @")\s*>";
      string rStr = Regex.Replace(s_TextStr, openingTag, "", RegexOptions.IgnoreCase);
      return Regex.Replace(rStr, closingTag, "", RegexOptions.IgnoreCase);
  }
  #endregion
  #region Load a file block
  /// <summary>
  /// Resolves a path through the given page.
  /// </summary>
  /// <param name="Path">Path handed to <c>ResolveUrl</c>.</param>
  /// <param name="p">Page used to resolve the path.</param>
  /// <returns>The value returned by <c>p.ResolveUrl(Path)</c>.</returns>
  public static string File(string Path, System.Web.UI.Page p)
  {
      string resolvedUrl = p.ResolveUrl(Path);
      return resolvedUrl;
  }
  #endregion
  #region Load a CSS stylesheet
  /// <summary>
  /// Builds the <c>link</c> element that references a stylesheet.
  /// </summary>
  /// <param name="cssPath">Stylesheet path handed to <c>ResolveUrl</c>.</param>
  /// <param name="p">Page used to resolve the path.</param>
  /// <returns>The link element markup followed by "\r\n".</returns>
  public static string CSS(string cssPath, System.Web.UI.Page p)
  {
      string resolvedUrl = p.ResolveUrl(cssPath);
      return string.Concat(@"<link href=""", resolvedUrl, @""" rel=""stylesheet"" type=""text/css"" />", "\r\n");
  }
  #endregion
  #region Load a JavaScript file
  /// <summary>
  /// Builds the <c>script</c> element that references a JavaScript file.
  /// </summary>
  /// <param name="jsPath">Script path handed to <c>ResolveUrl</c>.</param>
  /// <param name="p">Page used to resolve the path.</param>
  /// <returns>The script element markup followed by "\r\n".</returns>
  public static string JS(string jsPath, System.Web.UI.Page p)
  {
      string resolvedUrl = p.ResolveUrl(jsPath);
      return string.Concat(@"<script type=""text/javascript"" src=""", resolvedUrl, @"""></script>", "\r\n");
  }
  #endregion
  406. }
  407. }