Jeg har nu fået den til at bruge en queue, men jeg kan ikke gennemskue, hvordan jeg får den til at bruge multithreading / BackgroundWorker. Min kode er herunder ... Er der en af jer, der kan vise mig, hvordan jeg får den til at bruge multithreading, så jeg kan få optimeret crawl-processen? :)
// URLs waiting to be crawled: RunCrawl_Click seeds and drains it, TheCrawl appends newly found internal links.
public Queue<string> queue = new Queue<string>();
// Every URL seen so far; checked before a link is queued or recorded so each URL is handled only once.
public HashSet<string> allSiteUrls = new HashSet<string>();
/// <summary>
/// Click handler that starts a crawl from the address read from <c>Url.Value</c>.
/// </summary>
/// <remarks>
/// The start address gets an <c>http://</c> prefix when it starts with neither
/// <c>http://</c> nor <c>https://</c>, is recorded in <c>allSiteUrls</c>, and is queued.
/// The queue is then drained synchronously; <c>TheCrawl</c> may enqueue further URLs
/// while the loop runs, so the loop ends only when the queue is finally empty.
/// </remarks>
/// <param name="sender">Source of the event (not used).</param>
/// <param name="e">Event data (not used).</param>
public void RunCrawl_Click(object sender, EventArgs e)
{
    // Trim so stray whitespace around the address does not defeat the scheme check below.
    string startUrl = (Url.Value ?? string.Empty).Trim();
    if (startUrl.Length == 0)
    {
        // Nothing was entered; queuing a bare "http://" would only fail later.
        return;
    }

    bool hasScheme = startUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || startUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasScheme)
    {
        startUrl = "http://" + startUrl;
    }

    queue.Enqueue(startUrl);
    allSiteUrls.Add(startUrl);

    while (queue.Count > 0)
    {
        TheCrawl(queue.Dequeue(), 0);
    }
}
/// <summary>
/// Downloads one page, records it, and classifies every <c>&lt;a href&gt;</c> link found on it.
/// </summary>
/// <remarks>
/// Each link not seen before is added to <c>allSiteUrls</c>. Links classified as internal are
/// appended to <c>queue</c> so the caller's loop crawls them later; every other link is
/// recorded immediately through <c>AddCrawlUrl</c> together with its classification
/// ("External", "JavascriptCall", "Email" or "NoLink").
/// </remarks>
/// <param name="Url">
/// Address of the page to crawl; <c>http://</c> is prepended when it starts with neither
/// <c>http://</c> nor <c>https://</c>. A null or empty value is ignored.
/// </param>
/// <param name="Id">Not used by this method.</param>
public void TheCrawl(string Url, int Id)
{
    if (string.IsNullOrEmpty(Url))
    {
        return;
    }

    Url = EnsureHttpScheme(Url);
    AddCrawlUrl(Url, 1, "Internal");

    // The local shares its type's name so the call below binds whether
    // ExtractDomainNameFromURL is declared as an instance or a static member.
    HTMLCall HTMLCall = new HTMLCall();

    // Looked up once per page instead of several times per link; lower-cased because
    // it is compared against lower-cased href values below.
    string domain = (HTMLCall.ExtractDomainNameFromURL(Url) ?? string.Empty).ToLowerInvariant();

    HtmlWeb web = new HtmlWeb();
    // Configure before loading; the setting cannot influence a document already fetched.
    web.AutoDetectEncoding = true;
    HtmlDocument doc = web.Load(Url);

    var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
    if (anchors == null)
    {
        // SelectNodes yields null when the page has no matching anchors.
        return;
    }

    foreach (HtmlNode anchor in anchors)
    {
        // NOTE(review): lower-casing the whole href also lower-cases the path and query,
        // which may be case-sensitive on the target server. Kept because the rest of the
        // crawl de-duplicates on the lower-cased form.
        string href = (anchor.Attributes["href"].Value ?? string.Empty).ToLowerInvariant();

        // javascript: and mailto: targets can never be downloaded, so they must not be
        // treated as internal pages even when they mention the site's own domain
        // (e.g. mailto:info@<domain>).
        bool isPseudoLink = href.StartsWith("javascript:", StringComparison.Ordinal)
            || href.StartsWith("mailto:", StringComparison.Ordinal);

        bool isInternal = false;
        string newUrl = href;

        if (href.StartsWith("//", StringComparison.Ordinal))
        {
            // Protocol-relative link: it names its own host, only the scheme is missing.
            newUrl = "http:" + href;
        }
        else if (href.StartsWith("/", StringComparison.Ordinal))
        {
            // Root-relative link: it belongs to the site being crawled.
            isInternal = true;
            newUrl = "http://" + domain + href;
        }

        // The empty-domain guard matters: every string "contains" the empty string,
        // which would otherwise mark every link as internal.
        if (!isPseudoLink && domain.Length > 0 && href.IndexOf(domain, StringComparison.Ordinal) != -1)
        {
            isInternal = true;
        }

        if (isInternal)
        {
            // Internal links are crawled later, so store them in the same form in which
            // this method normalises its own Url argument.
            newUrl = EnsureHttpScheme(newUrl);
        }

        // HashSet.Add returns false when the URL was already seen.
        if (!allSiteUrls.Add(newUrl))
        {
            continue;
        }

        if (isInternal)
        {
            queue.Enqueue(newUrl);
        }
        else
        {
            AddCrawlUrl(newUrl, 1, ClassifyNonInternalLink(href));
        }
    }
}

// Returns the URL unchanged when it already starts with http:// or https:// (any casing);
// otherwise prepends "http://".
private static string EnsureHttpScheme(string url)
{
    bool hasScheme = url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    return hasScheme ? url : "http://" + url;
}

// Picks the label recorded for a link that is not internal; expects a lower-cased href.
private static string ClassifyNonInternalLink(string href)
{
    if (href.Contains("javascript"))
    {
        return "JavascriptCall";
    }

    if (href.Contains("@"))
    {
        return "Email";
    }

    // A value without any dot cannot name a host or a file, so it is not treated as a link.
    if (!href.Contains("."))
    {
        return "NoLink";
    }

    return "External";
}