123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436 |
- using System;
- using System.Text;
- using System.Net;
- using System.IO;
- using System.Threading;
- using System.Text.RegularExpressions;
- namespace Ant.Service.Utilities
- {
- public class HTMLHelper
- {
- #region 私有字段
- private static CookieContainer cc = new CookieContainer();
- private static string contentType = "application/x-www-form-urlencoded";
- private static string accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/x-silverlight, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/x-silverlight-2-b1, */*";
- private static string userAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";
- private static Encoding encoding = Encoding.GetEncoding("utf-8");
- private static int delay = 1000;
- private static int maxTry = 300;
- private static int currentTry = 0;
- #endregion
- #region 公有属性
- /// <summary>
- /// Cookie
- /// </summary>
- public static CookieContainer CookieContainer
- {
- get
- {
- return cc;
- }
- }
- /// <summary>
- /// 语言
- /// </summary>
- public static Encoding Encoding
- {
- get
- {
- return encoding;
- }
- set
- {
- encoding = value;
- }
- }
- public static int NetworkDelay
- {
- get
- {
- Random r = new Random();
- return (r.Next(delay, delay * 2));
- }
- set
- {
- delay = value;
- }
- }
- public static int MaxTry
- {
- get
- {
- return maxTry;
- }
- set
- {
- maxTry = value;
- }
- }
- #endregion
- #region 获取HTML
- /// <summary>
- /// 获取HTML
- /// </summary>
- /// <param name="url">地址</param>
- /// <param name="postData">post 提交的字符串</param>
- /// <param name="isPost">是否是post</param>
- /// <param name="cookieContainer">CookieContainer</param>
- public static string GetHtml(string url, string postData, bool isPost, CookieContainer cookieContainer)
- {
- if (string.IsNullOrEmpty(postData)) return GetHtml(url, cookieContainer);
- Thread.Sleep(NetworkDelay);
- currentTry++;
- HttpWebRequest httpWebRequest = null;
- HttpWebResponse httpWebResponse = null;
- try
- {
- byte[] byteRequest = Encoding.Default.GetBytes(postData);
- httpWebRequest = (HttpWebRequest)HttpWebRequest.Create(url);
- httpWebRequest.CookieContainer = cookieContainer;
- httpWebRequest.ContentType = contentType;
- httpWebRequest.ServicePoint.ConnectionLimit = maxTry;
- httpWebRequest.Referer = url;
- httpWebRequest.Accept = accept;
- httpWebRequest.UserAgent = userAgent;
- httpWebRequest.Method = isPost ? "POST" : "GET";
- httpWebRequest.ContentLength = byteRequest.Length;
- Stream stream = httpWebRequest.GetRequestStream();
- stream.Write(byteRequest, 0, byteRequest.Length);
- stream.Close();
- httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse();
- Stream responseStream = httpWebResponse.GetResponseStream();
- StreamReader streamReader = new StreamReader(responseStream, encoding);
- string html = streamReader.ReadToEnd();
- streamReader.Close();
- responseStream.Close();
- currentTry = 0;
- httpWebRequest.Abort();
- httpWebResponse.Close();
- return html;
- }
- catch (Exception e)
- {
- if (currentTry <= maxTry) GetHtml(url, postData, isPost, cookieContainer);
- currentTry--;
- if (httpWebRequest != null) httpWebRequest.Abort();
- if (httpWebResponse != null) httpWebResponse.Close();
- return string.Empty;
- }
- }
- /// <summary>
- /// 获取HTML
- /// </summary>
- /// <param name="url">地址</param>
- /// <param name="cookieContainer">CookieContainer</param>
- public static string GetHtml(string url, CookieContainer cookieContainer)
- {
- Thread.Sleep(NetworkDelay);
- currentTry++;
- HttpWebRequest httpWebRequest = null;
- HttpWebResponse httpWebResponse = null;
- try
- {
- httpWebRequest = (HttpWebRequest)HttpWebRequest.Create(url);
- httpWebRequest.CookieContainer = cookieContainer;
- httpWebRequest.ContentType = contentType;
- httpWebRequest.ServicePoint.ConnectionLimit = maxTry;
- httpWebRequest.Referer = url;
- httpWebRequest.Accept = accept;
- httpWebRequest.UserAgent = userAgent;
- httpWebRequest.Method = "GET";
- httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse();
- Stream responseStream = httpWebResponse.GetResponseStream();
- StreamReader streamReader = new StreamReader(responseStream, encoding);
- string html = streamReader.ReadToEnd();
- streamReader.Close();
- responseStream.Close();
- currentTry--;
- httpWebRequest.Abort();
- httpWebResponse.Close();
- return html;
- }
- catch (Exception e)
- {
- if (currentTry <= maxTry) GetHtml(url, cookieContainer);
- currentTry--;
- if (httpWebRequest != null) httpWebRequest.Abort();
- if (httpWebResponse != null) httpWebResponse.Close();
- return string.Empty;
- }
- }
- #endregion
- #region 获取字符流
- /// <summary>
- /// 获取字符流
- /// </summary>
- //---------------------------------------------------------------------------------------------------------------
- // 示例:
- // System.Net.CookieContainer cookie = new System.Net.CookieContainer();
- // Stream s = HttpHelper.GetStream("http://ptlogin2.qq.com/getimage?aid=15000102&0.43878429697395826", cookie);
- // picVerify.Image = Image.FromStream(s);
- //---------------------------------------------------------------------------------------------------------------
- /// <param name="url">地址</param>
- /// <param name="cookieContainer">cookieContainer</param>
- public static Stream GetStream(string url, CookieContainer cookieContainer)
- {
- currentTry++;
- HttpWebRequest httpWebRequest = null;
- HttpWebResponse httpWebResponse = null;
- try
- {
- httpWebRequest = (HttpWebRequest)HttpWebRequest.Create(url);
- httpWebRequest.CookieContainer = cookieContainer;
- httpWebRequest.ContentType = contentType;
- httpWebRequest.ServicePoint.ConnectionLimit = maxTry;
- httpWebRequest.Referer = url;
- httpWebRequest.Accept = accept;
- httpWebRequest.UserAgent = userAgent;
- httpWebRequest.Method = "GET";
- httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse();
- Stream responseStream = httpWebResponse.GetResponseStream();
- currentTry--;
- return responseStream;
- }
- catch (Exception e)
- {
- if (currentTry <= maxTry)
- {
- GetHtml(url, cookieContainer);
- }
- currentTry--;
- if (httpWebRequest != null)
- {
- httpWebRequest.Abort();
- } if (httpWebResponse != null)
- {
- httpWebResponse.Close();
- }
- return null;
- }
- }
- #endregion
- #region 清除HTML标记
- ///<summary>
- ///清除HTML标记
- ///</summary>
- ///<param name="NoHTML">包括HTML的源码</param>
- ///<returns>已经去除后的文字</returns>
- public static string NoHTML(string Htmlstring)
- {
- //删除脚本
- Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase);
- //删除HTML
- Regex regex = new Regex("<.+?>", RegexOptions.IgnoreCase);
- Htmlstring = regex.Replace(Htmlstring, "");
- Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
- Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
- Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
- Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);
- Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
- Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
- Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
- Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
- Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
- Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
- Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
- Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
- Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
- Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);
- Htmlstring.Replace("<", "");
- Htmlstring.Replace(">", "");
- Htmlstring.Replace("\r\n", "");
- return Htmlstring;
- }
- #endregion
- #region 匹配页面的链接
- /// <summary>
- /// 获取页面的链接正则
- /// </summary>
- public string GetHref(string HtmlCode)
- {
- string MatchVale = "";
- string Reg = @"(h|H)(r|R)(e|E)(f|F) *= *('|"")?((\w|\\|\/|\.|:|-|_)+)[\S]*";
- foreach (Match m in Regex.Matches(HtmlCode, Reg))
- {
- MatchVale += (m.Value).ToLower().Replace("href=", "").Trim() + "|";
- }
- return MatchVale;
- }
- #endregion
- #region 匹配页面的图片地址
- /// <summary>
- /// 匹配页面的图片地址
- /// </summary>
- /// <param name="imgHttp">要补充的http://路径信息</param>
- public string GetImgSrc(string HtmlCode, string imgHttp)
- {
- string MatchVale = "";
- string Reg = @"<img.+?>";
- foreach (Match m in Regex.Matches(HtmlCode.ToLower(), Reg))
- {
- MatchVale += GetImg((m.Value).ToLower().Trim(), imgHttp) + "|";
- }
- return MatchVale;
- }
- /// <summary>
- /// 匹配<img src="" />中的图片路径实际链接
- /// </summary>
- /// <param name="ImgString"><img src="" />字符串</param>
- public string GetImg(string ImgString, string imgHttp)
- {
- string MatchVale = "";
- string Reg = @"src=.+\.(bmp|jpg|gif|png|)";
- foreach (Match m in Regex.Matches(ImgString.ToLower(), Reg))
- {
- MatchVale += (m.Value).ToLower().Trim().Replace("src=", "");
- }
- if (MatchVale.IndexOf(".net") != -1 || MatchVale.IndexOf(".com") != -1 || MatchVale.IndexOf(".org") != -1 || MatchVale.IndexOf(".cn") != -1 || MatchVale.IndexOf(".cc") != -1 || MatchVale.IndexOf(".info") != -1 || MatchVale.IndexOf(".biz") != -1 || MatchVale.IndexOf(".tv") != -1)
- return (MatchVale);
- else
- return (imgHttp + MatchVale);
- }
- #endregion
- #region 抓取远程页面内容
- /// <summary>
- /// 以GET方式抓取远程页面内容
- /// </summary>
- public static string Get_Http(string tUrl)
- {
- string strResult;
- try
- {
- HttpWebRequest hwr = (HttpWebRequest)HttpWebRequest.Create(tUrl);
- hwr.Timeout = 19600;
- HttpWebResponse hwrs = (HttpWebResponse)hwr.GetResponse();
- Stream myStream = hwrs.GetResponseStream();
- StreamReader sr = new StreamReader(myStream, Encoding.Default);
- StringBuilder sb = new StringBuilder();
- while (-1 != sr.Peek())
- {
- sb.Append(sr.ReadLine() + "\r\n");
- }
- strResult = sb.ToString();
- hwrs.Close();
- }
- catch (Exception ee)
- {
- strResult = ee.Message;
- }
- return strResult;
- }
- /// <summary>
- /// 以POST方式抓取远程页面内容
- /// </summary>
- /// <param name="postData">参数列表</param>
- public static string Post_Http(string url, string postData, string encodeType)
- {
- string strResult = null;
- try
- {
- Encoding encoding = Encoding.GetEncoding(encodeType);
- byte[] POST = encoding.GetBytes(postData);
- HttpWebRequest myRequest = (HttpWebRequest)WebRequest.Create(url);
- myRequest.Method = "POST";
- myRequest.ContentType = "application/x-www-form-urlencoded";
- myRequest.ContentLength = POST.Length;
- Stream newStream = myRequest.GetRequestStream();
- newStream.Write(POST, 0, POST.Length); //设置POST
- newStream.Close();
- HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse();
- StreamReader reader = new StreamReader(myResponse.GetResponseStream(), Encoding.Default);
- strResult = reader.ReadToEnd();
- }
- catch (Exception ex)
- {
- strResult = ex.Message;
- }
- return strResult;
- }
- #endregion
- #region 压缩HTML输出
- /// <summary>
- /// 压缩HTML输出
- /// </summary>
- public static string ZipHtml(string Html)
- {
- Html = Regex.Replace(Html, @">\s+?<", "><");//去除HTML中的空白字符
- Html = Regex.Replace(Html, @"\r\n\s*", "");
- Html = Regex.Replace(Html, @"<body([\s|\S]*?)>([\s|\S]*?)</body>", @"<body$1>$2</body>", RegexOptions.IgnoreCase);
- return Html;
- }
- #endregion
- #region 过滤指定HTML标签
- /// <summary>
- /// 过滤指定HTML标签
- /// </summary>
- /// <param name="s_TextStr">要过滤的字符</param>
- /// <param name="html_Str">a img p div</param>
- public static string DelHtml(string s_TextStr, string html_Str)
- {
- string rStr = "";
- if (!string.IsNullOrEmpty(s_TextStr))
- {
- rStr = Regex.Replace(s_TextStr, "<" + html_Str + "[^>]*>", "", RegexOptions.IgnoreCase);
- rStr = Regex.Replace(rStr, "</" + html_Str + ">", "", RegexOptions.IgnoreCase);
- }
- return rStr;
- }
- #endregion
- #region 加载文件块
- /// <summary>
- /// 加载文件块
- /// </summary>
- public static string File(string Path, System.Web.UI.Page p)
- {
- return @p.ResolveUrl(Path);
- }
- #endregion
- #region 加载CSS样式文件
- /// <summary>
- /// 加载CSS样式文件
- /// </summary>
- public static string CSS(string cssPath, System.Web.UI.Page p)
- {
- return @"<link href=""" + p.ResolveUrl(cssPath) + @""" rel=""stylesheet"" type=""text/css"" />" + "\r\n";
- }
- #endregion
- #region 加载JavaScript脚本文件
- /// <summary>
- /// 加载javascript脚本文件
- /// </summary>
- public static string JS(string jsPath, System.Web.UI.Page p)
- {
- return @"<script type=""text/javascript"" src=""" + p.ResolveUrl(jsPath) + @"""></script>" + "\r\n";
- }
- #endregion
- }
- }
|