It seems to work just fine! That was the last piece needed to get my web crawler working:
#include <iostream>
#include <string>
#include <winsock2.h> // Winsock API (socket, connect, recv, ...); link with ws2_32.lib
#include <windows.h>
#include <cstdio> // printf, sprintf
#include <cstring>
using namespace std;
CRITICAL_SECTION cs;
DWORD __stdcall crawler(void *p);
//Function that fetches the HTML:
string get(char *host, char *path){
int sock;
int tmp;
char *input;
char cmd[512],resp[51200];
struct sockaddr local;
struct sockaddr remote;
struct hostent *hostinfo;
WSADATA WSAData;
WSAStartup(0x0101,&WSAData);
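//Note: every call to get() initialises and tears down Winsock again; it works, but one WSAStartup/WSACleanup per process would be enough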
//Create the socket
if((sock=socket(AF_INET,SOCK_STREAM,0))<0) {
printf("Error\n");
}
local.sa_family = AF_INET;
memset(local.sa_data,0,sizeof(local.sa_data));
if(bind(sock,&local,sizeof(local))<0){
printf("Error\n");
}
//Lookup host
hostinfo=gethostbyname(host);
if(!hostinfo){
printf("Error\n");
}
//Connect to the host:
remote.sa_family=hostinfo->h_addrtype;
memcpy(remote.sa_data+2,hostinfo->h_addr_list[0],hostinfo->h_length);
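//Store port 80 in the first two bytes of sa_data in network byte order; the swap below does by hand what htons(80) would do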
*((short *)remote.sa_data)=80;
tmp=remote.sa_data[0];
remote.sa_data[0]=remote.sa_data[1];
remote.sa_data[1]=tmp;
if(connect(sock,&remote,sizeof(remote))!=0) {
printf("Error\n");
}
//Send request:
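//"Connection: close" asks the server to close the socket after the response, so the recv loop below terminates (HTTP/1.1 keeps connections alive by default)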
sprintf(cmd,"GET %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n",path,host);
if(send(sock,cmd,strlen(cmd),0)<0) {
printf("Error sending GET request\n");
}
//Read the response
int ix=0;
int len;
while ((len=recv(sock,resp+ix,sizeof(resp)-ix-1,0))>0) {
ix = ix + len;
}
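//resp holds at most sizeof(resp)-1 bytes, so very large pages are truncated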
resp[ix]='\0';
//Close socket
closesocket(sock);
WSACleanup();
return resp;
}
//Function that finds all links in the HTML document
void findlink(string buffer){
int s = 2;
string link[1000];
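//Assumes at most 1000 links per page and link URLs shorter than 300 characters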
for (size_t i = 0; i + 3 < buffer.length(); i++){
if(s < 1000 && buffer.at(i)=='h' && buffer.at(i+1)=='r' && buffer.at(i+2)=='e' && buffer.at(i+3)=='f'){
for(size_t y = 0; y != 300 && i + y + 6 < buffer.length(); y++){
if(buffer.at(i+y+6)!='"'){
link[s].push_back(buffer.at(i+y+6));
}else{
s++;
break;
}
}
}
}
for(int m = 0;m!=1000;m++){
if(link[m]!=""){
cout << "#crawling link: "<<link[m] << "\n";
//Start a new thread to crawl the link; pass a heap copy so the
//pointer stays valid after this function's local array is destroyed
CreateThread(0, 0, crawler, new string(link[m]), 0, 0);
}
}
}
//The crawler itself, which splits a link into host and path:
DWORD __stdcall crawler(void *p){
string *plink = (string*)p;
string link = *plink;
delete plink; //free the heap copy allocated by the caller
EnterCriticalSection(&cs);
string host;
string path;
char host2[100];
char path2[100];
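//NOTE: host2/path2 are fixed 100-byte buffers; longer host names or paths would overflow them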
int offset = strlen("http://");
cout << "#Thread started\n";
if(link.find("http")<link.length()){
for(int i = offset; i!=link.length();i++){
if(link.at(i)=='/'){
path=link.substr(i,link.length());
host=link.substr(offset,i-offset);
break;
}
}
strcpy(host2,host.c_str());
strcpy(path2,path.c_str());
if(strlen(host2)<1){
cout << "#Reached deadend\n";
}else{
findlink(get(host2,path2));
}
}
LeaveCriticalSection(&cs);
return 0; //CreateThread expects a DWORD return value
}
int main(){
InitializeCriticalSection(&cs);
//Start the first crawler thread with a heap-allocated start URL (freed by the thread)
CreateThread(0, 0, crawler, new string("http://www.google.com/test"), 0, 0);
cin.get();
return 0;
}
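For reference, here is a minimal sketch of the same connect step done with sockaddr_in and htons instead of swapping the port bytes by hand. The Winsock calls are the standard ones; the function name connect_to_host and the bare-bones error handling are just for illustration, not part of the crawler above:
#include <winsock2.h>
#include <cstring>
//Sketch: resolve a host and connect to port 80 using sockaddr_in/htons.
//Assumes WSAStartup has already been called.
SOCKET connect_to_host(const char *host){
struct hostent *hostinfo = gethostbyname(host);
if(!hostinfo) return INVALID_SOCKET;
SOCKET sock = socket(AF_INET, SOCK_STREAM, 0);
if(sock == INVALID_SOCKET) return INVALID_SOCKET;
struct sockaddr_in remote;
memset(&remote, 0, sizeof(remote));
remote.sin_family = AF_INET;
remote.sin_port = htons(80); //htons handles the byte order
memcpy(&remote.sin_addr, hostinfo->h_addr_list[0], hostinfo->h_length);
if(connect(sock, (struct sockaddr*)&remote, sizeof(remote)) != 0){
closesocket(sock);
return INVALID_SOCKET;
}
return sock;
}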
- But bertel was first with the answer, while arne answered my follow-up question, so you should both post an answer :)
Thanks a lot for the help :)