Linux下C抓取网页
一直想做个爬虫，却不知道怎么开始。在网上找了个用 C 下载网页的程序，先保存下来，下次可以直接从这里查看。（完整代码见下文。）
·
一直想做个爬虫，却不知道怎么开始。在网上找了个用 C 下载网页的程序，先保存下来，下次可以直接从这里查看。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <sys/socket.h>
#include <sys/time.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <unistd.h>
/* Name of the web server whose root page is downloaded. */
char *host = "www.hao123.com";

/* TCP port to connect to on that server. */
int port = 80;
/*
 * Send all `len` bytes of `data` on socket `fd`, retrying after short writes.
 * Returns 0 once everything has been handed to send(), -1 if send() fails.
 */
static int send_all(int fd, const char *data, size_t len)
{
    size_t sent = 0;

    while (sent < len) {
        ssize_t n = send(fd, data + sent, len - sent, 0);
        if (n == -1) {
            return -1;
        }
        sent += (size_t)n;
    }
    return 0;
}

/*
 * Copy the response header from `fd` to stdout one byte at a time, stopping
 * right after the empty line that ends it (so no body bytes are consumed).
 * Returns 0 when that empty line was seen, -1 when recv() reported an error,
 * a timeout, or end of stream first.
 */
static int copy_headers(int fd)
{
    int chars = 0; /* characters other than CR/LF seen on the current line */

    for (;;) {
        char c;
        ssize_t n = recv(fd, &c, 1, 0);

        /* n == 0 means the peer closed the connection; without this check
         * the loop would spin forever re-examining a stale byte. */
        if (n <= 0) {
            return -1;
        }

        putchar(c);
        if (c == '\n') {
            if (chars == 0) {
                return 0;
            }
            chars = 0;
        } else if (c != '\r') {
            chars++;
        }
    }
}

/*
 * Copy everything that is still readable on `fd` to stdout, until recv()
 * reports end of stream, an error, or a receive timeout.
 */
static void copy_body(int fd)
{
    char buffer[512];
    ssize_t n;

    while ((n = recv(fd, buffer, sizeof buffer, 0)) > 0) {
        /* fwrite, not fputs: a chunk may contain NUL bytes. */
        fwrite(buffer, 1, (size_t)n, stdout);
    }
}

/*
 * Download "/" from the global `host` on the global `port` with a plain
 * HTTP/1.1 GET and print the request, the response header and the response
 * body to stdout. Diagnostics go to stderr.
 *
 * Returns EXIT_FAILURE when the host cannot be resolved, the socket cannot
 * be created or connected, or the request cannot be built or sent;
 * EXIT_SUCCESS otherwise (a truncated response is not treated as an error).
 */
int main(void)
{
    char message[512];
    struct sockaddr_in pin;
    struct hostent *remoteHost;
    /* Receive timeout: 1 second, 0 microseconds. */
    struct timeval timeout = { .tv_sec = 1, .tv_usec = 0 };
    int isock;
    int len;
    int status = EXIT_FAILURE;

    remoteHost = gethostbyname(host);
    if (remoteHost == NULL
        || remoteHost->h_addrtype != AF_INET
        || remoteHost->h_addr_list[0] == NULL
        || (size_t)remoteHost->h_length != sizeof pin.sin_addr) {
        fprintf(stderr, "Error resolving host\n");
        return EXIT_FAILURE;
    }

    memset(&pin, 0, sizeof pin);
    pin.sin_family = AF_INET;
    pin.sin_port = htons((unsigned short)port);
    /* memcpy instead of a pointer cast: the resolver's address bytes are a
     * plain char array. */
    memcpy(&pin.sin_addr, remoteHost->h_addr_list[0], sizeof pin.sin_addr);

    isock = socket(AF_INET, SOCK_STREAM, 0);
    if (isock == -1) {
        fprintf(stderr, "Error opening socket!\n");
        return EXIT_FAILURE;
    }

    /* "Connection: close" makes the server end the stream after the
     * response, so copy_body() stops at end of stream instead of depending
     * on the receive timeout. The single empty line terminates the request. */
    len = snprintf(message, sizeof message,
                   "GET / HTTP/1.1\r\n"
                   "Host: %s\r\n"
                   "Accept: */*\r\n"
                   "User-Agent: Mozilla/4.0(compatible)\r\n"
                   "Connection: close\r\n"
                   "\r\n",
                   host);
    if (len < 0 || (size_t)len >= sizeof message) {
        fprintf(stderr, "Error building request\n");
    } else {
        /* Echo the request so the user can see what is being sent. */
        fputs(message, stdout);

        if (connect(isock, (const struct sockaddr *)&pin, sizeof pin) == -1) {
            fprintf(stderr, "Error connecting to socket\n");
        } else if (send_all(isock, message, (size_t)len) == -1) {
            fprintf(stderr, "Error in send\n");
        } else {
            /* Not fatal if this fails: the reads below then simply block
             * until the server closes the connection. */
            if (setsockopt(isock, SOL_SOCKET, SO_RCVTIMEO,
                           &timeout, sizeof timeout) == -1) {
                fprintf(stderr, "Warning: could not set receive timeout\n");
            }

            if (copy_headers(isock) == 0) {
                copy_body(isock);
            }
            status = EXIT_SUCCESS;
        }
    }

    close(isock);
    return status;
}
更多推荐
已为社区贡献2条内容
所有评论(0)