This is a newer version (especially the source code) of the former blog entry at spin.
How does it work?
- use libxml to parse html
- scan css files for image urls
- sorry, but no javascript :(
- correct relative and absolute urls
- download all these files
Le sourcecode:
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/select.h>
#include <libxml/parser.h>
#include <libxml/HTMLparser.h>
#include <libxml/xmlerror.h>
#include <curl/curl.h>
#include "getpage.h"
/* Upper bound (in bytes, terminating NUL included) for a generated local file name. */
#define FILELENGTH 150
/* Handed to libcurl as both the connect timeout and the total transfer timeout, in seconds. */
#define CURL_TIMEOUT_SEC 240
/* Timeout, in seconds, for the select() call that waits on the download sockets. */
#define SELECT_TIMEOUT_SEC 10
/* Maximum number of file downloads kept in flight at the same time. */
#define MAX_P_FILE_DOWNLOADS 10
/* Enables the extra diagnostics guarded by #ifdef DEBUG in this file. */
#define DEBUG
/* Character pool from which _filename_gen draws the random part of long file names. */
static char ALPHABET[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890";
/*
Get out:
* TODO: frames
* TODO: javascript file riddles ;)
* TODO: wrong content-length -> reload
* TODO: same file in 2 different CSS files
*/
/*
 * Kind of resource a collected URL refers to. Each value occupies its own
 * bit. In the code visible here FRAME, PDF and NONE are only referenced by
 * _filetype_string.
 */
enum FILETYPE
{
	IMG     = 1 << 0,	/* <img>/<input> src and <link rel="icon"> targets */
	STYLE   = 1 << 1,	/* <link rel="stylesheet"> targets, filtered for url(...) */
	SCRIPT  = 1 << 2,	/* <script src> targets */
	IFRAME  = 1 << 3,	/* <iframe src> targets */
	FRAME   = 1 << 4,
	PDF     = 1 << 5,
	CSS_IMG = 1 << 6,	/* url(...) references found inside CSS text */
	NONE    = 1 << 7
};
/*
 * State of the streaming "gap" scanner driven by replace()/replace_step().
 * A gap is the text between an occurrence of `begin` and the next `end`
 * character; text is handed to `userfunction` piecewise, flagged as inside or
 * outside a gap.
 */
struct _replace_info
{
	char *begin;		/* marker string that opens a gap */
	char end;		/* character that closes a gap */
	void (*userfunction) (void*, char*, int, bool);	/* sink: (userdata, text, length, inside gap) */
	void *userdata;		/* passed through to userfunction untouched */
	char *buffer;		/* text carried over between replace() calls, or NULL */
	int begin_progress;	/* number of leading characters of `begin` matched so far */
	int begin_length;	/* length of `begin` */
	bool inside_gap;	/* true while between `begin` and `end` */
	int status;		/* result of the previous replace_step(): 1 inside, -1 outside */
};
/*
 * One file referenced by the page: a node of the singly linked download list
 * hanging off struct _site_userdata.
 */
struct _site_files
{
	char *url;			/* first candidate URL (tried with nth_url == 1) */
	char *url2;			/* fallback URL (tried with nth_url == 2), may be NULL */
	char *filename;			/* local file name the content is stored under */
	enum FILETYPE ft;		/* what kind of resource this is */
	struct _site_files *next;	/* next list node, NULL at the tail */
	FILE *fp;			/* output stream while the download is running */
	struct _replace_info *ri;	/* url(...) scanner, only set for STYLE files */
	short nth_url;			/* which candidate URL the current attempt uses */
#ifdef DEBUG
	int id;				/* running number assigned when the download is queued */
	bool done;			/* set once the transfer has been reported as finished */
#endif
};
/*
 * Per-page state shared by the SAX callbacks and the downloader.
 */
struct _site_userdata
{
	//void (*site_function)(void*, const char*, ...);
	void (*site_function)(void*, const char*, va_list);	/* receives the rewritten HTML, printf-style */
	void *userdata;			/* first argument handed to site_function */
	struct _site_files *sf;		/* head of the list of files to download */
	char *_base_url;		/* effective URL of the page; NULL until the page transfer finished */
	bool _utf8_meta_set;		/* true once a Content-Type <meta> has been emitted */
	CURL *_mhnd;
	CURL *_hnd;			/* easy handle used to fetch the page itself */
};
/*
 * State for _css_filter, which rewrites url(...) references inside an inline
 * style attribute.
 */
struct _css_filter_userdata
{
	struct _site_userdata *su;	/* page state; output goes through its site_function */
	char *url;			/* URL text collected so far for the current url(...), or NULL */
};
/*
 * State for _save_file_css_save, which writes a downloaded style sheet to disk
 * while rewriting its url(...) references.
 */
struct _css_filter_save_userdata
{
	struct _site_userdata *su;	/* page state owning the download list */
	FILE *fp;			/* output file, opened on the first callback */
	char *filename;			/* name of the output file */
	char *url;			/* URL text collected so far for the current url(...), or NULL */
	char *_css_base_url;		/* URL of the style sheet, used as base for its references */
};
/*
 * Map a FILETYPE value to a short label for log messages.
 * Returns a pointer to a string literal; the caller must not modify or free it.
 */
static char *_filetype_string(enum FILETYPE ft)
{
	switch (ft)
	{
		case IMG:	return "IMG";
		case CSS_IMG:	return "CSS_IMG";
		case STYLE:	return "STYLE";
		case SCRIPT:	return "SCRIPT";
		case IFRAME:	return "IFRAME";
		case FRAME:	return "FRAME";
		case PDF:	return "PDF";
		case NONE:	return "OTHER";
		default:	return "DEFAULT";
	}
}
/*
 * printf-style front end for the output callback: packs the variadic
 * arguments into a va_list and forwards them, together with the caller's
 * userdata, to su->site_function.
 */
static void _user_function(struct _site_userdata *su, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	(*su->site_function)(su->userdata, fmt, args);
	va_end(args);
}
/*
 * Append the first len_b bytes of b to the heap string a and NUL-terminate.
 *
 * a      heap-allocated string to extend, or NULL to start a new one;
 *        ownership passes to this function.
 * b      bytes to append (need not be NUL-terminated); NULL appends nothing.
 * len_b  number of bytes to take from b; values <= 0 append nothing.
 *
 * Returns the (possibly moved) string, which the caller must free(). On
 * allocation failure a is released and NULL is returned: every caller
 * overwrites its pointer with the result, so keeping a alive would leak it.
 */
static char *__join_together(char *a, char *b, int len_b)
{
	size_t len_a = (a != NULL) ? strlen(a) : 0;
	size_t add = (b != NULL && len_b > 0) ? (size_t)len_b : 0;
	char *joined = realloc(a, len_a + add + 1);

	if (joined == NULL)
	{
		free(a);
		return NULL;
	}
	if (add > 0)
		memcpy(joined + len_a, b, add);
	joined[len_a + add] = '\0';
	return joined;
}
// return true if inside gap -> 1
/*
 * Feed one character to the gap scanner.
 *
 * Advances the match against ri->begin and tracks whether the stream is
 * currently between `begin` and `end`.
 *
 * Returns  1 when txt lies inside a gap,
 *         -1 otherwise (outside a gap, or txt is the character that completes
 *            `begin` or the closing `end` character itself).
 */
static inline int replace_step(struct _replace_info *ri, char txt)
{
	if (txt == ri->begin[ri->begin_progress])
		ri->begin_progress++;
	else
		/* A failed partial match must not swallow a character that itself
		 * starts the marker: in "uurl(" the second 'u' begins the real match. */
		ri->begin_progress = (txt == ri->begin[0]) ? 1 : 0;

	if (ri->begin_progress == ri->begin_length)
	{
		/* marker complete: the gap starts with the next character */
		ri->begin_progress = 0;
		ri->inside_gap = true;
		return -1;
	}

	if (!ri->inside_gap)
		return -1;

	if (txt == ri->end)
	{
		ri->inside_gap = false;
		return -1;
	}
	return 1;
}
/*
 * Run `length` characters of txt through replace_step() and hand the text to
 * ri->userfunction in pieces, each flagged as inside (true) or outside (false)
 * a gap. ri->status carries the state of the previous character across calls,
 * so the input may arrive in arbitrary chunks.
 */
static void replace(struct _replace_info *ri, char *txt, int length)
{
int i;
// index of the first character of txt that has not been delivered yet
int offset = 0;
int status_temp = -1;
// Classify every character; whenever the classification differs from the
// previous one, deliver what has accumulated since the last switch.
for (i = 0; i < length; i++) { status_temp = replace_step(ri, txt[i]); if (ri->status != status_temp)
{
// text carried over from an earlier call is delivered first, then released
if (ri->buffer != NULL)
{
if (ri->status == 1)
{
ri->userfunction(ri->userdata, ri->buffer, strlen(ri->buffer), true);
}
else if (ri->status == -1)
{
ri->userfunction(ri->userdata, ri->buffer, strlen(ri->buffer), false);
}
free(ri->buffer);
ri->buffer = NULL;
}
// deliver txt[offset..i) with the OLD status; txt[i] opens the next piece
if (ri->status == 1)
{
ri->userfunction(ri->userdata, txt+offset, i-offset, true);
}
else if (ri->status == -1)
{
ri->userfunction(ri->userdata, txt+offset, i-offset, false);
}
offset = i;
}
ri->status = status_temp;
}
// deliver whatever is left after the last status switch
if (offset != length)
{
if (status_temp == 1 || status_temp == -1)
{
ri->userfunction(ri->userdata, txt+offset, i-offset, ri->status == 1 ? true : false);
}
else
{
// NOTE(review): only reached when status_temp is neither 1 nor -1; the
// replace_step() visible in this file returns nothing else, so this
// carry-over path looks unreachable - confirm before relying on ri->buffer.
if (txt[length-1] == '\0')
{
if (ri->buffer != NULL)
{
ri->userfunction(ri->userdata, ri->buffer, strlen(ri->buffer), false);
}
free(ri->buffer);
ri->buffer = NULL;
ri->userfunction(ri->userdata, txt+offset, length-offset, false);
}
else
ri->buffer = __join_together(ri->buffer, txt+offset, length-offset);
}
}
}
/*
 * Configure an easy handle for one download.
 *
 * hnd         handle to configure; NULL is ignored.
 * url         URL to fetch.
 * cbfunction  libcurl write callback (CURLOPT_WRITEFUNCTION).
 * userdata    pointer handed to the write callback (CURLOPT_WRITEDATA).
 *
 * curl_easy_setopt() is variadic and expects `long` for numeric options, so
 * every numeric argument carries an explicit long type.
 */
static void _set_chnd(CURL *hnd, char *url, void *cbfunction, void *userdata)
{
	if (hnd == NULL)
		return;

	curl_easy_setopt(hnd, CURLOPT_INFILESIZE_LARGE, (curl_off_t)-1);
	curl_easy_setopt(hnd, CURLOPT_URL, url);
	curl_easy_setopt(hnd, CURLOPT_NOPROGRESS, 1L);
	curl_easy_setopt(hnd, CURLOPT_FAILONERROR, 0L);
	curl_easy_setopt(hnd, CURLOPT_USERAGENT, "libmessage - btwotch+libmessage@gmail.com");
	//curl_easy_setopt(hnd, CURLOPT_USERAGENT, "Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.62 Safari/534.3");
	curl_easy_setopt(hnd, CURLOPT_RESUME_FROM_LARGE, (curl_off_t)0);
	curl_easy_setopt(hnd, CURLOPT_MAXREDIRS, 50L);
	curl_easy_setopt(hnd, CURLOPT_SSLVERSION, 0L);
	curl_easy_setopt(hnd, CURLOPT_TIMECONDITION, 0L);
	curl_easy_setopt(hnd, CURLOPT_TIMEVALUE, 0L);
	curl_easy_setopt(hnd, CURLOPT_CUSTOMREQUEST, NULL);
	curl_easy_setopt(hnd, CURLOPT_CONNECTTIMEOUT, (long)CURL_TIMEOUT_SEC);
	curl_easy_setopt(hnd, CURLOPT_TIMEOUT, (long)CURL_TIMEOUT_SEC);
	curl_easy_setopt(hnd, CURLOPT_HTTPAUTH, 1L);
	curl_easy_setopt(hnd, CURLOPT_IPRESOLVE, 0L);
	curl_easy_setopt(hnd, CURLOPT_IGNORE_CONTENT_LENGTH, 0L);
	curl_easy_setopt(hnd, CURLOPT_POSTREDIR, 0L);
	curl_easy_setopt(hnd, CURLOPT_WRITEFUNCTION, cbfunction);
	curl_easy_setopt(hnd, CURLOPT_WRITEDATA, userdata);
	curl_easy_setopt(hnd, CURLOPT_FOLLOWLOCATION, 1L);
	curl_easy_setopt(hnd, CURLOPT_NOSIGNAL, 1L);
	curl_easy_setopt(hnd, CURLOPT_AUTOREFERER, 1L);
	/* set once only: an earlier NULL setting was overridden by this one anyway */
	curl_easy_setopt(hnd, CURLOPT_ENCODING, "deflate");
	/* 2 is the documented "verify host name" value; 1 is rejected or treated
	 * as 2 depending on the libcurl version */
	curl_easy_setopt(hnd, CURLOPT_SSL_VERIFYHOST, 2L); // TODO
}
/*
 * Replace the second half of a FILELENGTH-byte file name with pseudo-random
 * characters from ALPHABET and NUL-terminate it.
 *
 * first_sf  list node to start the duplicate check from; nodes are compared
 *           until the end of the list or the first node without a file name.
 * filename  buffer of at least FILELENGTH bytes, modified in place.
 *
 * The generator is seeded from the first character of the name, so the first
 * candidate is reproducible. Seeding happens once, before the retry loop:
 * reseeding on every pass would regenerate the identical candidate and never
 * terminate after a collision.
 *
 * NOTE: the modulus is strlen(ALPHABET)-1, so the last character of ALPHABET
 * is never chosen; kept as is so generated names stay unchanged.
 */
static void _filename_gen(struct _site_files *first_sf, char *filename)
{
	const size_t pool = strlen(ALPHABET)-1;
	struct _site_files *sf;
	bool name_double;
	int i;

	srand(1337^filename[0]);
	do
	{
		name_double = false;
		for (i = FILELENGTH/2; i < FILELENGTH; i++)
			filename[i] = ALPHABET[rand() % pool];
		filename[FILELENGTH-1] = '\0';

		for (sf = first_sf; sf != NULL && sf->filename != NULL; sf = sf->next)
		{
			if (!strcasecmp(sf->filename, filename))
			{
				name_double = true;
				break;
			}
		}
	} while (name_double);
}
/*
 * Strip surrounding blanks and matching quote pairs from a URL, in place.
 *
 * rurl  writable, NUL-terminated string; its tail may be overwritten with NUL.
 *
 * Returns a pointer into rurl at the first character of the cleaned URL.
 * Nested pairs such as "'x'" are removed completely; an unmatched quote is
 * left alone.
 */
static char* _shrink_url(char *rurl) // remove apostrophes etc.
{
	size_t length;

	while (rurl[0] == ' ')
		rurl++;
	length = strlen(rurl);
	while (length > 0 && rurl[length-1] == ' ')
		rurl[--length] = '\0';

	while (length >= 2 && (rurl[0] == '\'' || rurl[0] == '\"') && rurl[0] == rurl[length-1])
	{
		rurl[length-1] = '\0';
		rurl++;
		length -= 2;
	}
	return rurl;
}

/*
 * Handle scheme-relative URLs ("//host/path") by prefixing "http:".
 *
 * On a match *abs_url receives a malloc()ed string the caller must free();
 * otherwise, or when the allocation fails, *abs_url is left untouched.
 * hnd and static_burl are unused.
 */
static void _crap_sites_aburl(char **abs_url, CURL *hnd, char *rurl, char *static_burl)
{
	size_t abs_urllen;
	char *result;

	(void)hnd;
	(void)static_burl;

	if (strncasecmp(rurl, "//", 2)) // gmx-hack
		return;

	abs_urllen = 5+strlen(rurl)+1;
	result = malloc(abs_urllen);
	if (result == NULL)
		return;
	snprintf(result, abs_urllen, "http:%s", rurl);
	*abs_url = result;
}

/*
 * Turn a relative URL into an absolute one by prefixing part of the base URL.
 *
 * abs_url      receives a malloc()ed result the caller must free(); left
 *              untouched when rurl already carries a scheme, no base URL is
 *              known, or the allocation fails.
 * hnd          easy handle whose effective URL serves as base when
 *              static_burl is NULL.
 * rurl         URL to resolve.
 * static_burl  explicit base URL, or NULL.
 * nth_url      > 0: cut the base at the nth_url-th single '/' after the
 *              scheme; -1: cut the base at its last '/'.
 *
 * The process exits when libcurl cannot report the effective URL.
 */
static void _relative_aburl(char **abs_url, CURL *hnd, char *rurl, char *static_burl, short nth_url)
{
	size_t burl_len, abs_urllen, i;
	size_t domain_end = 0;
	size_t base_len = 0;
	char *burl = NULL;
	char *result;

	if (!strncasecmp(rurl, "http://", 7) || !strncasecmp(rurl, "https://", 8) ||
	    !strncasecmp(rurl, "ftp://", 6) || !strncasecmp(rurl, "file://", 7) ||
	    !strncasecmp(rurl, "about:", 6) || !strncasecmp(rurl, "javascript:", 11))
		return;

	if (static_burl != NULL)
		burl = static_burl;
	else if (curl_easy_getinfo(hnd, CURLINFO_EFFECTIVE_URL, &burl) != CURLE_OK)
	{
		fprintf(stderr, "CURLINFO_EFFECTIVE_URL failed\n");
		exit(1);
	}
	if (burl == NULL)	/* libcurl may succeed without having a URL yet */
		return;
	burl_len = strlen(burl);

	if (!strncasecmp(burl, "http://", 7))
		domain_end = 7;
	else if (!strncasecmp(burl, "https://", 8))
		domain_end = 8;
	else if (!strncasecmp(burl, "ftp://", 6))
		domain_end = 6;
	else if (!strncasecmp(burl, "file://", 7))
		domain_end = burl_len;

	if (nth_url > 0)
	{
		for (i = domain_end+1; i < burl_len; i++)
		{
			if (burl[i] != '/')
				continue;
			if (i+1 < burl_len && burl[i+1] == '/')
				continue;	/* first half of a "//" does not count */
			if (nth_url == 1)
			{
				base_len = i;
				break;
			}
			nth_url--;
		}
	}
	if (nth_url == -1)
	{
		for (i = burl_len; i > domain_end; i--)
		{
			if (burl[i] == '/')
			{
				base_len = i;
				break;
			}
		}
	}
	if (base_len == 0)
		base_len = burl_len;

	abs_urllen = strlen(rurl) + burl_len + 2;
	result = malloc(abs_urllen);
	if (result == NULL)
		return;
	snprintf(result, abs_urllen, "%.*s/%s", (int)base_len, burl, rurl);
	*abs_url = result;
}
/*
 * Build an absolute URL for rurl.
 *
 * hnd          easy handle used to look up the base URL when static_burl is NULL.
 * rurl         URL as found in the document; NULL yields NULL.
 * static_burl  explicit base URL, or NULL.
 * nth_url      1 selects the primary candidate (scheme-relative fix-up, then
 *              base-relative, then a plain copy of rurl); any other value
 *              only attempts the base-relative form.
 *
 * Returns a malloc()ed string the caller must free(), or NULL when no URL
 * could be produced (already-absolute rurl with nth_url != 1, or out of
 * memory).
 */
static char* _absolute_url(CURL *hnd, char *rurl, char *static_burl, short nth_url)
{
	char *abs_url = NULL;

	if (rurl == NULL)
		return NULL;

	if (nth_url == 1)
	{
		_crap_sites_aburl(&abs_url, hnd, rurl, static_burl);
		if (abs_url != NULL)
			return abs_url;
	}

	_relative_aburl(&abs_url, hnd, rurl, static_burl, nth_url);

	if (abs_url == NULL && nth_url == 1)
	{
		/* rurl is already absolute: hand back a private copy */
		size_t abs_urllen = strlen(rurl)+1;

		abs_url = malloc(abs_urllen);
		if (abs_url != NULL)
			memcpy(abs_url, rurl, abs_urllen);
	}
	return abs_url;
}
/*
 * Register a referenced file for download and return its local file name.
 *
 * su        page state owning the download list.
 * url       URL as found in the document; it is cleaned in place by
 *           _shrink_url. NULL yields NULL.
 * base_url  base for relative URLs; NULL selects su->_base_url.
 * ft        kind of resource.
 *
 * If a file with the same primary URL is already listed, its file name is
 * returned and nothing is added. Otherwise a node is appended whose file name
 * is the URL with '/', '?', '#', '@', '%', ':' and ' ' replaced by '_'; URLs
 * that do not fit into FILELENGTH bytes are truncated and get a generated
 * tail.
 *
 * Returns a pointer owned by the list (do not free), or NULL on allocation
 * failure.
 */
static char *_site_files_add(struct _site_userdata *su, char *url, char *base_url, enum FILETYPE ft)
{
	struct _site_files *sf, *tail = NULL;
	char *burl, *newurl, *sec_url, *newfilename;
	size_t filename_length, i;

	if (url == NULL)
		return NULL;

	url = _shrink_url(url);
	burl = (base_url != NULL) ? base_url : su->_base_url;
	newurl = _absolute_url(su->_hnd, url, burl, 1);
	if (newurl == NULL)
		return NULL;
	sec_url = _absolute_url(su->_hnd, url, burl, -1);

	/* already known? remember the tail on the way for appending */
	for (sf = su->sf; sf != NULL; sf = sf->next)
	{
		if (!strcmp(sf->url, newurl))
		{
			free(newurl);
			free(sec_url);
			return sf->filename;
		}
		tail = sf;
	}

	filename_length = strlen(newurl)+1;
	if (filename_length > FILELENGTH)
		filename_length = FILELENGTH;

	sf = malloc(sizeof *sf);
	newfilename = malloc(filename_length);
	if (sf == NULL || newfilename == NULL)
	{
		free(sf);
		free(newfilename);
		free(newurl);
		free(sec_url);
		return NULL;
	}

	sf->url = newurl;
	sf->url2 = sec_url;
	sf->filename = NULL;
	sf->ft = ft;
	sf->next = NULL;
	sf->fp = NULL;
	sf->ri = NULL;
	sf->nth_url = 0;
#ifdef DEBUG
	sf->id = 0;
	sf->done = false;
#endif

	memcpy(newfilename, newurl, filename_length-1);
	newfilename[filename_length-1] = '\0';
	/* NOTE(review): _filename_gen stops comparing at the first node without a
	 * file name, and the node passed here has none yet, so no existing name is
	 * actually compared. Passing su->sf would enable the check, but only
	 * together with a _filename_gen whose retry loop can terminate. */
	if (filename_length == FILELENGTH)
		_filename_gen(sf, newfilename);

	for (i = 0; newfilename[i] != '\0'; i++)
	{
		if (strchr("/?#@%: ", newfilename[i]) != NULL)
			newfilename[i] = '_';
	}
	sf->filename = newfilename;

	if (tail != NULL)
		tail->next = sf;
	else
		su->sf = sf;
	return sf->filename;
}
/*
 * Sink for the url(...) scanner of a downloaded style sheet.
 *
 * userdata  struct _css_filter_save_userdata of the style sheet.
 * gap       text fragment (not NUL-terminated), `length` bytes long.
 * gapped    true when the fragment lies inside url(...).
 *
 * Fragments inside url(...) are collected; when the following outside
 * fragment arrives, the collected URL is registered for download and its
 * local file name is written in its place. Everything else is copied to the
 * output file unchanged. The output file is opened on the first call; if it
 * cannot be opened the fragment is dropped.
 */
void _save_file_css_save(void *userdata, char *gap, int length, bool gapped)
{
	struct _css_filter_save_userdata *cfsu = (struct _css_filter_save_userdata*) userdata;
	char *filename;

	if (cfsu->fp == NULL)
	{ // first call of this func.
		cfsu->fp = fopen(cfsu->filename, "w");
		cfsu->url = NULL;
		if (cfsu->fp == NULL)
		{
			perror("fopen");
			return;
		}
	}

	if (gapped)
	{
		cfsu->url = __join_together(cfsu->url, gap, length);
		return;
	}

	if (cfsu->url != NULL)
	{
		filename = _site_files_add(cfsu->su, cfsu->url, cfsu->_css_base_url, CSS_IMG);
		if (filename != NULL)
			fputs(filename, cfsu->fp);
		free(cfsu->url);
		cfsu->url = NULL;
	}
	if (length > 0)
		fwrite(gap, 1, (size_t)length, cfsu->fp);
}
size_t _save_file_css(char *txt, size_t size, size_t nmemb, struct _site_files *sf) // feed the replacer!
{
if (size == 0 && nmemb == 0 && sf->fp != NULL)
{
fclose(sf->fp);
}
else if (sf->fp == NULL)
sf->fp=fopen(sf->filename, "w");
if (sf->fp == NULL)
{
perror("fopen");
return 0;
}
replace(sf->ri, txt, size*nmemb);
return size*nmemb;
}
/*
 * libcurl write callback for files stored verbatim.
 *
 * A call with size == 0 and nmemb == 0 is the end-of-transfer notification
 * issued by the downloader: it closes sf->fp, resets it to NULL so it cannot
 * be reused after fclose(), and returns 0.
 *
 * Returns the number of bytes actually written. A short count (including 0
 * when the file cannot be opened) makes libcurl abort the transfer.
 */
size_t _save_file(char *txt, size_t size, size_t nmemb, struct _site_files *sf)
{
	if (size == 0 && nmemb == 0)
	{
		if (sf->fp != NULL)
		{
			fclose(sf->fp);
			sf->fp = NULL;
		}
		return 0;
	}

	if (sf->fp == NULL)
		sf->fp=fopen(sf->filename, "wb");	/* binary mode: images must not be newline-translated */
	if (sf->fp == NULL)
	{
		perror("fopen");
		return 0;
	}
	return fwrite(txt, 1, size*nmemb, sf->fp);
}
/*
 * Initialise a scanner for CSS url(...) references: the gap opens after the
 * four characters "url(" and closes at ')'. Text is reported to userfunction
 * together with userdata.
 */
static void _set_css_ri(struct _replace_info *ri, void *userdata, void *userfunction)
{
	/* scanner state: nothing matched yet, outside of any gap */
	ri->status = -1;
	ri->inside_gap = false;
	ri->begin_progress = 0;
	ri->buffer = NULL;

	/* markers delimiting a gap */
	ri->begin = "url(";
	ri->begin_length = 4;
	ri->end = ')';

	/* sink */
	ri->userdata = userdata;
	ri->userfunction = userfunction;
}
/*
 * Queue one download attempt for sf on the multi handle.
 *
 * sf       file to download.
 * su       page state; needed by the CSS filter of STYLE files.
 * mhnd     multi handle the new easy handle is added to.
 * nth_url  1 to use sf->url, 2 to use the fallback sf->url2.
 *
 * STYLE files get a fresh url(...) scanner for each attempt; the state of a
 * previous attempt is released first.
 *
 * Returns 1 when the transfer was queued, -1 when there is nothing to try
 * (unknown nth_url, missing URL) or a resource could not be set up.
 */
static int _add_download_files(struct _site_files *sf, struct _site_userdata *su, CURL *mhnd, short nth_url)
{
	struct _css_filter_save_userdata *cfsu;
	CURL *hnd;
	char *url;

	sf->fp = NULL;
	sf->nth_url = nth_url;
#ifdef DEBUG
	static int id;
	sf->id = id++;
	fprintf(stderr, "Download (id: %d, %s) %s -> %s\n", sf->id, _filetype_string(sf->ft), sf->url, sf->filename);
#endif

	if (nth_url == 1)
		url = sf->url;
	else if (nth_url == 2)
		url = sf->url2;
	else
		url = NULL;
	if (url == NULL)
		return -1;

	if (sf->ft == STYLE)
	{
		if (sf->ri != NULL)
		{
			/* retry: drop the scanner of the previous attempt */
			cfsu = sf->ri->userdata;
			if (cfsu != NULL)
			{
				if (cfsu->fp != NULL)
					fclose(cfsu->fp);
				free(cfsu->url);
				free(cfsu);
			}
			free(sf->ri);
			sf->ri = NULL;
		}

		sf->ri = malloc(sizeof *sf->ri);
		cfsu = malloc(sizeof *cfsu);
		if (sf->ri == NULL || cfsu == NULL)
		{
			free(cfsu);
			free(sf->ri);
			sf->ri = NULL;
			return -1;
		}
		cfsu->_css_base_url = sf->url;
		cfsu->fp = NULL;
		cfsu->url = NULL;
		cfsu->filename = sf->filename;
		cfsu->su = su;
		_set_css_ri(sf->ri, cfsu, _save_file_css_save);
	}

	hnd = curl_easy_init();
	if (hnd == NULL)
		return -1;
	if (sf->ft == STYLE)
		_set_chnd(hnd, url, _save_file_css, sf);
	else
		_set_chnd(hnd, url, _save_file, sf);
	/* lets the downloader find sf again when the transfer finishes */
	curl_easy_setopt(hnd, CURLOPT_PRIVATE, sf);

	if (curl_multi_add_handle(mhnd, hnd) != CURLM_OK)
	{
		curl_easy_cleanup(hnd);
		return -1;
	}
#ifdef DEBUG
	sf->done = false;
#endif
	return 1;
}
static void _download_files(struct _site_userdata *su)
{
int handles = 1, msgs_in_queue, maxfd;
int iteration = 0;
int downloads = 0; // current downloads
char *curlinfo_private;
CURL *mhnd;
CURLMsg *cmsg;
struct _site_files *first_sf = su->sf;
struct _site_files *sf = first_sf;
struct _site_files *tmp_sf;
struct timeval timeout;
fd_set fdread, fdwrite, fderr;
char *burl;
#ifdef DEBUG
char *ip;
#endif
long response_code;
if (sf == NULL)
return;
mhnd = curl_multi_init();
if (_add_download_files(sf, su, mhnd, 1) > 0)
downloads++;
sf = sf->next;
while (CURLM_CALL_MULTI_PERFORM == curl_multi_perform(mhnd, &handles) && handles != 0);
do
{
iteration++;
FD_ZERO(&fdread);
FD_ZERO(&fdwrite);
FD_ZERO(&fderr);
timeout.tv_sec = SELECT_TIMEOUT_SEC;
timeout.tv_usec = 0;
curl_multi_fdset(mhnd, &fdread, &fdwrite, &fderr, &maxfd);
switch(select(maxfd+1, &fdread, &fdwrite, &fderr, &timeout))
{
case -1:
#ifdef DEBUG
fprintf(stderr, "select bad :(\n");
perror("!!! select failed ");
while ((cmsg = curl_multi_info_read(mhnd, &msgs_in_queue)) != NULL)
{
if (cmsg->data.result != 0)
{
curl_easy_getinfo(cmsg->easy_handle, CURLINFO_PRIMARY_IP, &ip);
fprintf(stderr, "ip: %s url: %s result: %d", ip, burl, cmsg->data.result);
if (cmsg->data.result == 7)
fprintf(stderr, " (couldn't connect)");
fprintf(stderr, "\n");
}
}
fprintf(stderr, "-----------\n");
#endif
default:
while ((cmsg = curl_multi_info_read(mhnd, &msgs_in_queue)) != NULL)
if (cmsg->msg == CURLMSG_DONE)
{
curl_easy_getinfo(cmsg->easy_handle, CURLINFO_PRIVATE, &curlinfo_private);
tmp_sf = (struct _site_files*)curlinfo_private;
if (tmp_sf->ft == CSS_IMG)
_save_file_css(NULL, 0, 0, tmp_sf);
else
_save_file(NULL, 0, 0, tmp_sf);
downloads--;
#ifdef DEBUG
tmp_sf->done = true;
#endif
curl_easy_getinfo(cmsg->easy_handle, CURLINFO_EFFECTIVE_URL, &burl);
curl_easy_getinfo(cmsg->easy_handle, CURLINFO_RESPONSE_CODE, &response_code);
if (response_code >= 400)
{
if (tmp_sf->nth_url == 2)
{
fprintf(stderr, "Failed (%ld): %s -> %s (%s) ", response_code, burl, tmp_sf->filename, _filetype_string(tmp_sf->ft));
fprintf(stderr, "second url: %s\n", tmp_sf->url2);
}
else
{
if (_add_download_files(tmp_sf, su, mhnd, 2) > 0)
downloads++;
}
}
curl_easy_cleanup(cmsg->easy_handle);
}
do
{
// download 1st file
if (iteration == 1 && sf != NULL && downloads < MAX_P_FILE_DOWNLOADS) { if (_add_download_files(sf, su, mhnd, 1) > 0)
downloads++;
}
while (sf != NULL && sf->next != NULL && downloads < MAX_P_FILE_DOWNLOADS) // sf->next must not be NULL as we are adding to the list ;)
{
if (_add_download_files(sf->next, su, mhnd, 1) > 0)
downloads++;
sf = sf->next;
}
} while (CURLM_CALL_MULTI_PERFORM == curl_multi_perform(mhnd, &handles) && handles != 0);
break;
}
} while(handles != 0);
while ((cmsg = curl_multi_info_read(mhnd, &msgs_in_queue)) != NULL)
{
curl_easy_getinfo(cmsg->easy_handle, CURLINFO_PRIVATE, &curlinfo_private);
tmp_sf = (struct _site_files*)curlinfo_private;
if (tmp_sf->ft == CSS_IMG)
_save_file_css(NULL, 0, 0, tmp_sf);
else
_save_file(NULL, 0, 0, tmp_sf);
downloads--;
curl_easy_getinfo(cmsg->easy_handle, CURLINFO_EFFECTIVE_URL, &burl);
curl_easy_getinfo(cmsg->easy_handle, CURLINFO_RESPONSE_CODE, &response_code);
if (response_code >= 400)
{
if (tmp_sf->nth_url == 2)
{
fprintf(stderr, "Failed (%ld): %s -> %s (%s) ", response_code, burl, tmp_sf->filename, _filetype_string(tmp_sf->ft));
fprintf(stderr, "second url: %s\n", tmp_sf->url2);
}
else
{
if (_add_download_files(tmp_sf, su, mhnd, 2) > 0)
downloads++;
}
}
curl_easy_cleanup(cmsg->easy_handle);
#ifdef DEBUG
tmp_sf->done = true;
#endif
}
sf = first_sf;
while (sf != NULL)
{
#ifdef DEBUG
printf("id: %d url: %s url2: %s done: %d\n", sf->id, sf->url, sf->url2, sf->done);
#endif
free(sf->url);
free(sf->url2);
free(sf->filename);
if (sf->ri != NULL)
{
free(sf->ri->userdata);
free(sf->ri);
}
tmp_sf = sf;
sf = sf->next;
free(tmp_sf);
}
#ifdef DEBUG
if (downloads != 0)
{
printf("!!downloads: %d (%s)\n", downloads, su->_base_url);
exit(-1);
}
#endif
curl_multi_cleanup(mhnd);
}
/*
 * Sink for the url(...) scanner of an inline style attribute.
 *
 * Fragments inside url(...) (gapped == true) are collected in the filter
 * state. When the following outside fragment arrives, the collected URL is
 * registered as a CSS image and its local file name is emitted in its place;
 * the outside fragment itself is always emitted unchanged.
 */
void _css_filter(void *userdata, char *gap, int length, bool gapped)
{
	struct _css_filter_userdata *state = userdata;

	if (gapped)
	{
		state->url = __join_together(state->url, gap, length);
		return;
	}

	if (state->url != NULL)
	{
		char *local_name = _site_files_add(state->su, state->url, NULL, CSS_IMG);

		_user_function(state->su, "%s", local_name);
		free(state->url);
		state->url = NULL;
	}
	_user_function(state->su, "%.*s", length, gap);
}
static void _getpage_startElementSAX (void * userData, const xmlChar * name, const xmlChar ** atts)
{
int i, j;
char *n = (char*)name;
char *filename, *url;
struct _site_userdata *su = userData;
struct _css_filter_userdata cfu;
struct _replace_info ri;
_user_function(su, "<%s", n); if (atts != NULL) for (i = 0; atts[i] != NULL; i+=2) { filename = NULL; if (!strncasecmp(n, "img", 4) && !strncasecmp((char*)atts[i], "src", 4)) { filename = _site_files_add(su, (char*)atts[i+1], NULL, IMG); _user_function(su, " src=\"file:%s\"", filename); } else if (!strncasecmp(n, "input", 6) && !strncasecmp((char*)atts[i], "src", 4)) { filename = _site_files_add(su, (char*)atts[i+1], NULL, IMG); _user_function(su, " src=\"file:%s\"", filename); } else if (!strncasecmp(n, "script", 7) && !strncasecmp((char*)atts[i], "src", 4)) { filename = _site_files_add(su, (char*)atts[i+1], NULL, SCRIPT); _user_function(su, " src=\"file:%s\"", filename); } else if (!strncasecmp(n, "iframe", 7) && !strncasecmp((char*)atts[i], "src", 4)) { filename = _site_files_add(su, (char*)atts[i+1], NULL, IFRAME); _user_function(su, " src=\"file:%s\"", filename); } else if (!strncasecmp((char*)atts[i], "style", 6)) { cfu.su = su; cfu.url = NULL; _set_css_ri(&ri, &cfu, _css_filter); _user_function(su, " style=\""); replace(&ri, (char*)atts[i+1], strlen((char*)atts[i+1])); if (cfu.url != NULL) free(cfu.url); _user_function(su, "\""); filename = (void*)-1; } else if (!strncasecmp(n, "link", 5) && !strncasecmp((char*)atts[i], "href", 5)) { for (j = 0; atts[j] != NULL; j+=2) if (!strncasecmp((char*)atts[j], "rel", 4)) { if (!strncasecmp((char*)atts[j+1], "stylesheet", 11)) { filename = _site_files_add(su, (char*)atts[i+1], NULL, STYLE); _user_function(su, " href=\"file:%s\"", filename); } else if (!strncasecmp((char*)atts[j+1], "icon", 5)) { filename = _site_files_add(su, (char*)atts[i+1], NULL, IMG); _user_function(su, " href=\"file:%s\"", filename); } else if (!strncasecmp((char*)atts[j+1], "shortcut icon", 14)) { filename = _site_files_add(su, (char*)atts[i+1], NULL, IMG); _user_function(su, " href=\"file:%s\"", filename); } } } else if (!strncasecmp(n, "a", 2) && !strncasecmp((char*)atts[i], "href", 5)) { url = _absolute_url(su->_hnd, (char*)atts[i+1], 
su->_base_url, 1);
_user_function(su, " href=\"%s\"", url);
free(url);
filename = (void*)-1;
}
else if (!strncasecmp(n, "base", 5) && !strncasecmp((char*)atts[i], "href", 5))
{
_user_function(su, " href=\".\"");
filename = (void*)-1;
}
else if (!strncasecmp(n, "form", 5) && !strncasecmp((char*)atts[i], "action", 7))
{
url = _absolute_url(su->_hnd, (char*)atts[i+1], su->_base_url, 1);
_user_function(su, " action=\"%s\"", url);
free(url);
filename = (void*)-1;
}
else if (!strncasecmp(n, "meta", 5) && !strncasecmp((char*)atts[i], "http-equiv", 8) && !strncasecmp((char*)atts[i+1], "Content-Type", 13))
{
su->_utf8_meta_set = true;
_user_function(su, " http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"");
//filename = (void*)-1;
break;
}
if (filename == NULL)
_user_function(su, " %s=\"%s\"", (char*)atts[i], (char*)atts[i+1]);
}
_user_function(su, ">");
}
/*
 * SAX end-element callback: emits the closing tag.
 *
 * Before </head> a UTF-8 Content-Type <meta> is inserted unless one has
 * already been emitted. Void elements, which have no closing tag in HTML,
 * produce no output.
 */
static void _getpage_endElementSAX (void * userData, const xmlChar * name)
{
	static const char *const void_elements[] = {
		"br", "img", "meta", "link", "input", "hr", "area", "base",
		"col", "embed", "param", "source", "track", "wbr", NULL
	};
	char *n = (char*)name;
	struct _site_userdata *su = userData;
	int i;

	if (!strcasecmp("head", n) && !su->_utf8_meta_set)
		_user_function(su, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/>");

	for (i = 0; void_elements[i] != NULL; i++)
	{
		if (!strcasecmp(void_elements[i], n))
			return;
	}
	_user_function(su, "</%s>\n", n);
}
/*
 * SAX character-data callback: passes the `length` bytes at `buffer` to the
 * output callback unchanged.
 */
static void _getpage_charDataSAX (void * userData, const xmlChar * buffer, int length)
{
	_user_function((struct _site_userdata *)userData, "%.*s", length, buffer);
}
/*
 * libcurl write callback for the page itself: pushes each received chunk
 * into the HTML push parser. With DEBUG defined the raw bytes are also
 * appended to "bare.txt"; failing to open that file is not an error.
 *
 * Returns the number of bytes received, i.e. the chunk is always reported as
 * consumed.
 */
static size_t _chunk_parse(void *ptr, size_t size, size_t nmemb, xmlParserCtxtPtr ctxt)
{
	char *txt = ptr;
	size_t total = size*nmemb;
#ifdef DEBUG
	FILE *fp = fopen("bare.txt", "a+");

	if (fp != NULL)
	{
		fwrite(txt, 1, total, fp);
		fclose(fp);
	}
#endif
	htmlParseChunk(ctxt, txt, (int)total, 0);
	return total;
}
void getpage(char *url, void *site_function, void *userdata)
{
struct _site_userdata su;
su.site_function = site_function;
su.userdata = userdata;
su.sf = NULL;
su._utf8_meta_set = false;
su._base_url = NULL;
CURLcode ret;
htmlSAXHandler hsh;
htmlParserCtxtPtr ctxt;
#ifdef DEBUG
remove("bare.txt");
#endif
memset(&hsh, 0, sizeof(htmlSAXHandler));
hsh.startElement = _getpage_startElementSAX;
hsh.endElement = _getpage_endElementSAX;
hsh.characters = _getpage_charDataSAX;
ctxt = htmlCreatePushParserCtxt(&hsh, &su, NULL, 0, NULL, XML_CHAR_ENCODING_UTF8);
htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER);
curl_global_init(CURL_GLOBAL_ALL);
su._hnd = curl_easy_init();
_set_chnd(su._hnd, url, _chunk_parse, ctxt);
ret = curl_easy_perform(su._hnd);
htmlParseChunk(ctxt, NULL, 0, 1);
htmlFreeParserCtxt(ctxt);
curl_easy_getinfo(su._hnd, CURLINFO_EFFECTIVE_URL, &su._base_url);
#ifdef DEBUG
double val;
if (curl_easy_getinfo(su._hnd, CURLINFO_SPEED_DOWNLOAD, &val) == CURLE_OK)
printf("Average download speed: %0.3f kbyte/sec.\n", val / 1024);
#endif
fprintf(stderr, "Downloading files ...\n");
_download_files(&su);
curl_easy_cleanup(su._hnd);
// curl_global_cleanup();
}
(save it as getpage.c)
To use that library:
#include <stdio.h>
#include <stdarg.h>
#include "getpage.h"
/*
 * Output callback for getpage(): userdata is the FILE the rewritten page is
 * written to. Each piece is formatted into the file and flushed immediately.
 */
void site_function(void *userdata, const char* format, va_list ap)
{
	FILE *out = (FILE *)userdata;

	vfprintf(out, format, ap);
	fflush(out);
}
/*
 * Usage: getpagetest <url> <output.html>
 *
 * Writes the rewritten page to <output.html>; the referenced files are
 * stored by getpage() itself. Returns 0 on success, 1 on wrong usage and -1
 * when the output file cannot be opened or written.
 */
int main(int argc, char **argv)
{
	FILE *fp;

	if (argc < 3)
	{
		fprintf(stderr, "usage: %s <url> <output.html>\n", argc > 0 ? argv[0] : "getpagetest");
		return 1;
	}

	fp = fopen(argv[2], "w");
	if (fp == NULL)
	{
		perror(argv[2]);
		return -1;
	}
	getpage(argv[1], site_function, fp);
	if (fclose(fp) != 0)
	{
		perror(argv[2]);
		return -1;
	}
	return 0;
}
(save that as getpagetest.c)
And now the Makefile (the command lines must be indented with tab characters, not spaces!):
CC=/usr/bin/colorgcc
CFLAGS=-O2 -ggdb -Wall
# Libraries are listed after the objects that use them: linkers resolve
# symbols left to right and may drop a library named before its users.
getpagetest: getpage.o getpagetest.c getpage.h
	$(CC) $(CFLAGS) -std=c99 -o getpagetest getpagetest.c getpage.o -I /usr/include/libxml2/ -lxml2 -lcurl
getpage.o: getpage.c getpage.h
	$(CC) $(CFLAGS) -o getpage.o -Wall -std=c99 -fPIC getpage.c -I /usr/include/libxml2/ -c
Usage:
make && ./getpagetest heise.de index.html && chromium --proxy-server=localhost:1 index.html