H漫国产剧情在线,九七电影院理论片在线观看,亚洲欧美自拍制服另类图区,8x资源导航,jijzzizz老师出水喷水多毛,性xxxx18免费观看视频,玩小处雏女嫩苞在线视频,最新视频网站在线观看色多多,亚洲涩图国产,国产精品99久久不卡

C#采集網站頁面的所有圖片與標題內容

當(dāng)前位置：點(diǎn)晴教程→知識(shí)管理交流 →『技術(shù)文檔交流』

admin

2017年4月5日 0:40 本文熱度 6219

今天因為工作需要，我要用.net來寫一個采集功能，實現要求是：采集指定網站頁面中所有圖片并且可以過濾不符合要求的圖片并保存到本地，同時還需要把標題與內容采集下來。

原理很簡單：抓取網頁源碼=>正則匹配圖片地址=>使用C#提供的方法下載圖片

如果是縮略圖形式，再獲取A標簽的HREF地址，過濾出圖片地址即可。

代碼如下

復(fù)制代碼

/// <summary>
        /// Extracts the <c>src</c> value of every <c>&lt;img&gt;</c> tag found in the given HTML.
        /// </summary>
        /// <param name="html">HTML source to scan. Null or empty input yields an empty result.</param>
        /// <returns>
        /// All image addresses concatenated into one string, each one followed by an
        /// <c>@</c> separator (so a non-empty result always ends with <c>@</c>).
        /// </returns>
        static string GetImgUrl(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return string.Empty;
            }

            // <img\b[^>]*?   stay inside one tag, and allow the tag to span several lines
            // \ssrc\s*=\s*   require whitespace before "src" so "data-src" is not picked up
            // [""']?         the value may be double-quoted, single-quoted or unquoted
            // ([^""'\s>]+)   the address ends at a quote, whitespace or the closing '>'
            const string pattern = @"<img\b[^>]*?\ssrc\s*=\s*[""']?([^""'\s>]+)";

            StringBuilder builder = new StringBuilder();
            foreach (Match match in Regex.Matches(html, pattern, RegexOptions.IgnoreCase))
            {
                builder.Append(match.Groups[1].Value);
                builder.Append('@');
            }
            return builder.ToString();
        }

        /// <summary>
        /// Demo entry point: downloads a page, prints its source, then saves every
        /// image referenced by the page and prints each image address.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string html = GetHtmlInfo("www.baidu.com", 15000, Encoding.GetEncoding("GBK"));
            Console.WriteLine(html);

            // GetImgUrl returns "url@url@...@". Dropping empty entries handles the trailing
            // separator and also the "no images found" case, which would otherwise produce
            // a single empty address.
            string[] imgarr = GetImgUrl(html).Split(new[] { '@' }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string imgUrl in imgarr)
            {
                try
                {
                    SaveImg(imgUrl);
                    Console.WriteLine(imgUrl);
                }
                catch (Exception ex)
                {
                    // One bad address (relative URL, 404, non-image payload, ...) should
                    // not stop the remaining downloads; report it and carry on.
                    Console.Error.WriteLine(imgUrl + " : " + ex.Message);
                }
            }
            Console.Read();

        }
        /// <summary>
        /// Downloads a single image into the <c>D:\Tony</c> folder, keeping the file name
        /// from the URL, and prints the image size when it is wider than 100 pixels.
        /// </summary>
        /// <param name="imgurl">Absolute URL of the image to download.</param>
        /// <exception cref="ArgumentException">The URL path does not end in a file name.</exception>
        static void SaveImg(string imgurl)
        {
            const string saveDir = @"D:\Tony";

            // Take the name from the URL path only, so a query string or fragment
            // never ends up in the local file name.
            string imgName = Path.GetFileName(new Uri(imgurl).AbsolutePath);
            if (string.IsNullOrEmpty(imgName))
            {
                throw new ArgumentException("URL does not end in a file name: " + imgurl, "imgurl");
            }
            string savePath = Path.Combine(saveDir, imgName);

            // CreateDirectory does nothing when the directory already exists.
            Directory.CreateDirectory(saveDir);

            WebRequest request = WebRequest.Create(imgurl);
            using (WebResponse response = request.GetResponse())
            using (Stream reader = response.GetResponseStream())
            // FileMode.Create truncates an existing file; OpenOrCreate would leave stale
            // trailing bytes behind when the new image is shorter than the old one.
            using (FileStream writer = new FileStream(savePath, FileMode.Create, FileAccess.Write))
            {
                byte[] buff = new byte[512];
                int c;
                while ((c = reader.Read(buff, 0, buff.Length)) > 0)
                {
                    writer.Write(buff, 0, c);
                }
            }

            // Images can be filtered by size here. The file is opened only after the
            // writer has been closed, and under the same path it was written to.
            using (Image img = Image.FromFile(savePath))
            {
                if (img.Size.Width > 100)
                {
                    Console.WriteLine(img.Size);
                }
            }
        }

/// <summary>
        /// Downloads the HTML source of a page.
        /// </summary>
        /// <param name="url">Page address; <c>http://</c> is prepended when no scheme is given.</param>
        /// <param name="timeout">Request timeout in milliseconds.</param>
        /// <param name="EnCodeType">Encoding used to decode the response body.</param>
        /// <returns>
        /// The page source with every line terminated by CRLF; an empty string when the
        /// status code is not 200 OK; or the exception message when the request fails.
        /// </returns>
        static string GetHtmlInfo(string url, int timeout, Encoding EnCodeType)
        {
            if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
                !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                url = "http://" + url;
            }

            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Timeout = timeout;
                // The property takes the header VALUE only; a leading "User-Agent:" would be
                // sent as part of the value.
                request.UserAgent = "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 2.0.40607; .NET CLR 1.1.4322; .NET CLR 3.5.30729)";
                request.Accept = "*/*";
                request.KeepAlive = true;
                request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        return string.Empty;
                    }

                    using (Stream stream = response.GetResponseStream())
                    using (StreamReader reader = new StreamReader(stream, EnCodeType))
                    {
                        StringBuilder builder = new StringBuilder();
                        string line;
                        while ((line = reader.ReadLine()) != null)
                        {
                            builder.Append(line);
                            builder.Append("\r\n");
                        }
                        return builder.ToString();
                    }
                }
            }
            catch (Exception ex)
            {
                // Existing contract: failures are reported through the return value.
                return ex.Message;
            }
        }

C#快速獲取網頁頁面的標題

代碼如下

復(fù)制代碼

using System.Text.RegularExpressions;
/// <summary>
        /// Returns the text of the first <c>&lt;title&gt;</c> element in the given HTML.
        /// </summary>
        /// <param name="html">HTML source to search.</param>
        /// <returns>
        /// The trimmed title text, or an empty string when the input is null/empty or
        /// contains no complete <c>&lt;title&gt;...&lt;/title&gt;</c> pair.
        /// </returns>
        static string GetTitle(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return string.Empty;
            }

            // [\s\S]*? matches any character including line breaks, and is lazy so the
            // match stops at the FIRST </title> rather than the last one on the page.
            Regex reg = new Regex(@"<title\b[^>]*>([\s\S]*?)</title>", RegexOptions.IgnoreCase);
            return reg.Match(html).Groups[1].Value.Trim();
        }

        /// <summary>
        /// Demo entry point: fetches a page, prints the downloaded source followed by
        /// the page title, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string pageSource = GetHtmlInfo("www.rczjp.cn", 5000, Encoding.UTF8);
            Console.WriteLine(pageSource);

            string pageTitle = GetTitle(pageSource);
            Console.WriteLine(pageTitle);

            // Keep the console window open until the user presses a key.
            Console.Read();
        }

        /// <summary>
        /// Downloads the HTML source of a page, stopping after the line that contains
        /// the closing <c>&lt;/title&gt;</c> tag.
        /// </summary>
        /// <param name="url">Page address; <c>http://</c> is prepended when no scheme is given.</param>
        /// <param name="timeout">Request timeout in milliseconds.</param>
        /// <param name="EnCodeType">Encoding used to decode the response body.</param>
        /// <returns>
        /// The source read so far, lines joined by CRLF; an empty string when the status
        /// code is not 200 OK; or the exception message when the request fails.
        /// </returns>
        static string GetHtmlInfo(string url, int timeout, Encoding EnCodeType)
        {
            if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
                !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                url = "http://" + url;
            }

            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Timeout = timeout;
                // The property takes the header VALUE only; a leading "User-Agent:" would be
                // sent as part of the value.
                request.UserAgent = "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 2.0.40607; .NET CLR 1.1.4322; .NET CLR 3.5.30729)";
                request.Accept = "*/*";
                request.KeepAlive = true;
                request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        return string.Empty;
                    }

                    using (Stream stream = response.GetResponseStream())
                    using (StreamReader reader = new StreamReader(stream, EnCodeType))
                    {
                        StringBuilder builder = new StringBuilder();
                        string line;
                        while ((line = reader.ReadLine()) != null)
                        {
                            builder.Append(line);
                            // ReadLine returns the whole line, so anything that follows
                            // </title> on the same line is included as well. Only the
                            // current line needs checking: the tag cannot span the CRLF
                            // separator inserted between lines.
                            if (line.IndexOf("</title>", StringComparison.OrdinalIgnoreCase) >= 0)
                            {
                                break;
                            }
                            builder.Append("\r\n");
                        }
                        return builder.ToString();
                    }
                }
            }
            catch (Exception ex)
            {
                // Existing contract: failures are reported through the return value.
                return ex.Message;
            }
        }

該文章在 2017/4/5 0:40:56 編輯過

關(guān)鍵字查詢

相關(guān)文章

正在查詢...

點(diǎn)晴ERP是一款針對(duì)中小制造業(yè)的專業(yè)生產(chǎn)管理軟件系統(tǒng),系統(tǒng)成熟度和易用性得到了國內(nèi)大量中小企業(yè)的青睞。

點(diǎn)晴PMS碼頭管理系統(tǒng)主要針對(duì)港口碼頭集裝箱與散貨日常運(yùn)作、調(diào)度、堆場(chǎng)、車隊(duì)、財(cái)務(wù)費(fèi)用、相關(guān)報(bào)表等業(yè)務(wù)管理，結(jié)合碼頭的業(yè)務(wù)特點(diǎn)，圍繞調(diào)度、堆場(chǎng)作業(yè)而開發(fā)的。集技術(shù)的先進(jìn)性、管理的有效性于一體，是物流碼頭及其他港口類企業(yè)的高效ERP管理信息系統(tǒng)。

點(diǎn)晴WMS倉儲(chǔ)管理系統(tǒng)提供了貨物產(chǎn)品管理,銷售管理,采購管理,倉儲(chǔ)管理,倉庫管理,保質(zhì)期管理,貨位管理,庫位管理,生產(chǎn)管理,WMS管理系統(tǒng),標(biāo)簽打印,條形碼,二維碼管理,批號(hào)管理軟件。

點(diǎn)晴免費(fèi)OA是一款軟件和通用服務(wù)都免費(fèi)，不限功能、不限時(shí)間、不限用戶的免費(fèi)OA協(xié)同辦公管理系統(tǒng)。