cURL / Mailing Lists / curl-library / Single Mail

curl-library

libcurl multi-interface valgrind

From: todd <taf2_at_lehigh.edu>
Date: Wed, 05 Nov 2003 15:08:14 -0500

Hi,
    I've been using libcurl for a while, but only the easy interface. I
recently decided to make
use of the multi interface for handling asynchronous DNS and retrieval
of robots.txt.
The two attached files are part of a larger web crawler I'm working on
for a new
search engine. I've been using valgrind to analyze my source and am
fairly confident
that, unless I'm misusing curl, I've found a 4-byte write error in
libcurl. Here is the
stack trace that I get from valgrind.

==20767== Thread 2:
==20767== Invalid write of size 4
==20767== at 0x4038B1E7: Curl_resolv_unlock (hostip.c:366)
==20767== by 0x4039990E: Curl_done (url.c:3210)
==20767== by 0x403A6E4C: curl_multi_perform (multi.c:507)
==20767== by 0x80535A4: NewHostPool::resolver() (NewHost.cc:171)
==20767== Address 0x41AEAF30 is 8 bytes inside a block of size 12 free'd
==20767== at 0x400413D4: free (vg_clientfuncs.c:180)
==20767== by 0x4038B28C: Curl_freednsinfo (hostip.c:397)
==20767== by 0x403A62BC: hash_element_dtor (hash.c:64)
==20767== by 0x403A61A1: Curl_llist_remove (llist.c:137)

attaching gdb at this point gives me the following stack trace:

#0 vg_do_syscall3 (syscallno=4294966784, arg1=20778, arg2=0, arg3=0) at
vg_mylibc.c:92
#1 0x0000512a in ?? ()
#2 0x4038b1e7 in Curl_resolv_unlock (data=0x41bf1b84, dns=0x415ce5b0)
at hostip.c:363
#3 0x4039990e in Curl_done (conn=0x41bf1b84) at url.c:3210
#4 0x403a6e4c in curl_multi_perform (multi_handle=0x419f598c,
running_handles=0x41bf1cd8) at multi.c:502
#5 0x080535a4 in NewHostPool::resolver() (this=0x419f5930) at
crawler/NewHost.cc:171
#6 0x4027a957 in g_thread_create_proxy (data=0x419f65dc) at gthread.c:551
#7 0x4036c5bd in thread_wrapper (info=0xfffffe00) at vg_libpthread.c:635
#8 0x40045845 in do__apply_in_new_thread_bogusRA () at vg_scheduler.c:2023

I don't want to rule out that the error is in my usage of libcurl, so
that is why I've included
the source for how I'm calling the multi interface. In the event that this
really is an error in
libcurl, well, I hope this helps someone more familiar with the hostip.c
file than I am ;)

-todd

#include "NewHost.h"
#include <sstream>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <unistd.h>
#include <signal.h>

struct RobotsProxy{
        RobotsProxy( Host *h, volatile bool *s ):host(h),stop(s)
        {
                curl = curl_easy_init();
                if( curl == NULL ){
                        // this is really bad!!
                        fprintf( stderr, "Fatal Error Failed to initialize a curl handle!!\n" );
                        return;
                }
                curl_easy_setopt( curl, CURLOPT_PRIVATE, this );
        }
        ~RobotsProxy(){
                curl_easy_cleanup( curl );
        }
        static size_t robot_retrieve_doc( void *ptr, size_t size, size_t nmemb, void *data );

        bool initialize_host();

        void finished( int rcode );

        CURL *curl;
        Host *host;
        std::string url;
        std::string robots_buffer;
        volatile bool *stop;
};

        void RobotsProxy::finished( int rcode )
        {
                if( !(*stop) && rcode == CURLE_OK && robots_buffer.length() > 0 ){
                        // assume User-agent: *
                        // search for Disallows
                        std::istringstream isrm( robots_buffer );
                        std::string line;
                        while( std::getline( isrm, line ) ){
                                char *loc;
                                if( (loc=icstrstr( line.c_str(), "Disallow:" )) ){
                                        loc += 10; // strlen( "disallow:" ) + 1;
                                        while( *loc != '\0' && isspace( *loc ) ){ ++loc; }
                                        host->no_robots.insert( loc );//hash_term( loc, strlen( loc ) ) );
                                }
                        }
                        host->has_robots_exclusion = true;
                }
                else
                        host->has_robots_exclusion = false;
// printf( "host ready: %s, with ip: %s\n", host->name, host->ipaddr );
                host->resolving = false;
                host->ready = true;
        }
        bool RobotsProxy::initialize_host()
        {
                struct addrinfo *ans, hints;
                memset( &hints, 0, sizeof( hints ) );
                hints.ai_family = PF_UNSPEC;
                hints.ai_socktype = SOCK_STREAM;
// printf( "resolving: %s\n", host->name );
                if( getaddrinfo( host->name, NULL, &hints, &ans ) ){
                        printf( "1failed to resolve host: %s\n", host->name );
                        return true;
                }
                // NOTE: this implies pointer and int are the same size
                int port = (int)host->ipaddr;
                host->ipaddr = (char*)malloc( sizeof(char)*INET6_ADDRSTRLEN );
                const char *p = inet_ntop( ans->ai_family,
                                                                                                                        (void*)&((struct sockaddr_in*)ans->ai_addr)->sin_addr,
                                                                                                                        host->ipaddr, INET6_ADDRSTRLEN );
                freeaddrinfo( ans );
                if( !p ){
                        // better free that ipaddr
                        free( host->ipaddr );
                        host->ipaddr = NULL;
                // printf( "2failed to resolve host: %s\n", host->name );
                        return true;
                }
                if( port != 80 && port != 0 ){
                        char buf[16];
                        snprintf( buf, 16, "%d", port );
                        std::string ipaddr( host->ipaddr );
                        ipaddr += ":";
                        ipaddr += buf;
                        host->ipaddr = (char*)realloc( host->ipaddr, ipaddr.length() + 1 );
                        memcpy( host->ipaddr, ipaddr.c_str(), ipaddr.length()+1 );
                        printf( "port changed for ipaddr: %s\n", host->ipaddr );
                }
        // printf( "resolved host: %s to %s\n", host->name, host->ipaddr );
                
                std::string httpheader_attr( "Host: " );
                httpheader_attr+=host->name;
                host->http_host_header =
                        curl_slist_append( host->http_host_header, httpheader_attr.c_str() );
                // set curl options
                url = "http://";
                url += host->ipaddr;
                url += "/robots.txt";
                curl_easy_setopt( curl, CURLOPT_URL, url.c_str() );
                curl_easy_setopt( curl, CURLOPT_DNS_CACHE_TIMEOUT, -1 ); // never expire
                curl_easy_setopt( curl, CURLOPT_NOSIGNAL, 1 ); // don't allow signals to interrupt the us in a multi-thread env
                curl_easy_setopt( curl, CURLOPT_WRITEFUNCTION, robot_retrieve_doc );
                curl_easy_setopt( curl, CURLOPT_FILE, this );
                curl_easy_setopt( curl, CURLOPT_NOPROGRESS, 1 );

                return false;
        }

size_t
RobotsProxy::robot_retrieve_doc( void *ptr, size_t size, size_t nmemb, void *data )
{
        RobotsProxy *proc = (RobotsProxy*)data;
        size_t bytes = nmemb * size;
        proc->robots_buffer.append( (char*)ptr, bytes );
        if( *(proc->stop) ){
                proc->robots_buffer="";
                fprintf( stderr, "CRAWLER DOWNLOAD ABORTING BREAKING OUT OF DOWNLOAD EARLY\n" );
                return 0;
        }
        return bytes;
}

/**
 * Thread entry point passed to g_thread_create: forwards to resolver() on
 * the NewHostPool supplied as the thread's user data.
 */
void NewHostPool::resolver_func( gpointer data )
{
        static_cast<NewHostPool*>( data )->resolver();
}
void NewHostPool::resolver()
{
        sigset_t sig_to_block;
        sigemptyset( &sig_to_block );
        sigaddset( &sig_to_block, SIGINT );
        sigaddset( &sig_to_block, SIGTERM );
        pthread_sigmask( SIG_BLOCK, &sig_to_block, (sigset_t*)NULL );
// printf( "resolver thread is runnning\n" );
        Host *nework = NULL;
        int running;
        int max;
        struct timeval tval;
        fd_set read;
        fd_set write;
        fd_set error;
        size_t resolved = 0;
        tval.tv_sec = 2; // 2 seconds
        tval.tv_usec= 0;

        while( !stop ){
                // add any new work to the resolver loop
                while( ( nework = (Host*)g_async_queue_try_pop( workque ) ) &&
                                         !stop ){
// printf( "got new work\n" );
                        RobotsProxy *robots = new RobotsProxy( nework, &stop );
                        if( robots->initialize_host() ){
                                // failed to resolve host
                                delete robots;
                                continue;
                        }
                        usleep( 1000 ); // don't hit the DNS server too hard // TODO: user configurable
                        ++resolved;
// printf( "new work added to multi handle\n" );
                        robotsmap.insert( std::map<int,void*>::value_type( (int)robots->curl, (void*)robots ) );
                        curl_multi_add_handle( curlm, robots->curl );
                        curl_multi_perform( curlm, &running );
                }
// printf( "running\r" );
                FD_ZERO( &read );
                FD_ZERO( &write );
                FD_ZERO( &error );
                curl_multi_fdset( curlm, &read, &write, &error, &max );
                switch( select( max+1, &read, &write, &error, &tval ) ){
                case -1: // error
                        break;
                case 0:
                default:
                        // process finished downloads
                        CURLMsg *msg;
                        int numqueued;
                        while( ( msg = curl_multi_info_read( this->curlm, &numqueued ) ) && !stop ){
                                switch( msg->msg ){
                                case CURLMSG_DONE:
                                        RobotsProxy*proxy;
                                        /// NOTE: once this bug in libcurl is fixed we can do away with robotsmap
// if( CURLE_OK != curl_easy_getinfo( msg->easy_handle, CURLINFO_PRIVATE, &proxy ) ){
                                        // fprintf( stderr, "curl private error\n" );
                                        std::map<int,void*>::iterator it = robotsmap.find( (int)msg->easy_handle );
                                        if( it == robotsmap.end() )
                                                continue;
                                        proxy = (RobotsProxy*)it->second;
                                        proxy->finished( msg->data.result );
// printf( "in resolver: %u\n", resolved);
// }
// else{
// proxy->finished( msg->data.result );
// }
                                        curl_multi_remove_handle( curlm, msg->easy_handle );
                                        g_mutex_lock( cache_lock );
                                        robotsmap.erase( it );
                                        DNSCache::iterator loc = cache.find( (const char*)proxy->host->name );
                                        if( loc == cache.end() ){
                                                cache.insert( DNSCache::value_type( proxy->host->name, proxy->host->ipaddr ) );
                                        }
                                        else{
                                                free( proxy->host->ipaddr ); // duplicated
                                        }
                                        g_mutex_unlock( cache_lock );

                                        delete proxy;
                                        break;
                                }
                        }

                        break;
                }
                curl_multi_perform( this->curlm, &running );
                usleep( 50 );
        }
}
/**
 * Creates the work queue, the libcurl multi handle and the cache mutex, then
 * starts the joinable resolver thread.
 *
 * The thread is started from the constructor body, i.e. only after every
 * member (including the maps) has been constructed.
 *
 * @param s stop flag, kept by reference and polled by the resolver thread
 */
NewHostPool::NewHostPool( volatile bool &s )
        : workque( g_async_queue_new() ),
          curlm( curl_multi_init() ),
          cache_lock( g_mutex_new() ),
          stop( s )
{
        thread = g_thread_create( (GThreadFunc)resolver_func, this, true, NULL );
}
NewHostPool::~NewHostPool()
{
        g_mutex_lock( cache_lock );
        printf( "printing the number of hosts\n" );
        printf( "cached a total of : %u hosts\n", cache.size() );
        g_mutex_unlock( cache_lock );

        printf( "joining the new host pool\n" );
        g_thread_join( thread );
        printf( "new host pool joined\n" );
        std::vector<CURL*> handles;
        for( std::map<int,void*>::iterator it = robotsmap.begin(); it != robotsmap.end(); ++it ){
                handles.push_back( (CURL*)it->first );
// curl_multi_remove_handle( curlm, (CURL*)it->first );
                if( ((RobotsProxy*)it->second)->host->ipaddr ){
                        free( ((RobotsProxy*)it->second)->host->ipaddr );
                }
                delete (RobotsProxy*)it->second;
        }
        for( int i = 0; i < handles.size(); ++i ){
                curl_multi_remove_handle( curlm, handles[i] );
        }
        curl_multi_cleanup( curlm );
        g_async_queue_unref( workque );
        
        g_mutex_lock( cache_lock );
        printf( "freeing ipaddrs\n" );
        for( DNSCache::iterator it = cache.begin(); it != cache.end(); ++it ){
                free( it->second );
        }
        g_mutex_unlock( cache_lock );

        g_mutex_free( cache_lock );
}
void NewHostPool::prepare_host( Host *host )
{
        if( host->ready || host->resolving ){
                printf( "host is already ready!!\n" );
                return;
        }
        // check cache first
        g_mutex_lock( cache_lock ); // lock the cache
        DNSCache::iterator it = cache.find( host->name );
        if( it != cache.end() ){
// if( cache.find( host->name, host->ipaddr ) ){
                host->ipaddr = it->second;
                g_mutex_unlock( cache_lock ); // unlock the cache
                
                std::string httpheader_attr( "Host: " );
                httpheader_attr+=host->name;
                host->http_host_header = NULL;
                host->http_host_header = curl_slist_append( host->http_host_header,
                                                                                                                                                                                                /*strdup( */httpheader_attr.c_str() );// );
        // NOTE: ?? does curl_slist_append create a copy of the string ??
                host->ready = true;
        }
        else{
                g_mutex_unlock( cache_lock ); // unlock the cache
                host->resolving = true;
                g_async_queue_push( workque, host );
        }
}

#ifndef NEW_HOST_H
#define NEW_HOST_H
#include "URL.h"
#include <glib.h>
#include <curl/curl.h>
#include <curl/types.h>
#include <curl/easy.h>
#include <curl/multi.h>
#include <map> // needed to handle libcurl error with private data
#include "common.h"
#include "hash.h"
#include "hash_table.h"
// will resolve host and
// fetch robots.txt if it exists
// Background pool that prepares Host objects for crawling: a resolver
// thread resolves each queued host and fetches its robots.txt, and a
// name -> address cache short-circuits hosts that were already resolved.
struct NewHostPool{
        // `stop` is kept by reference and polled by the resolver thread.
        NewHostPool( volatile bool &stop );
        ~NewHostPool();

        // Serves `host` from the cache when possible, otherwise queues it
        // for the resolver thread.
        void prepare_host( Host *host );

        // Resolver thread body; loops until `stop` becomes true.
        void resolver();
private:

        GAsyncQueue *workque;   // hosts pushed by prepare_host, popped by resolver
        CURLM *curlm;           // multi handle driving the robots.txt transfers
        GThread *thread;        // resolver thread, joined in the destructor
        GMutex *cache_lock;     // held around accesses to `cache`
        volatile bool &stop;    // stop flag supplied to the constructor
        // thread entry point; `data` is the NewHostPool
        static void resolver_func( gpointer data );
        // easy handle (cast to int) -> RobotsProxy*, for transfers in flight
        std::map<int,void*> robotsmap;
/* typedef HashTable<char*, // host name
                                                                                char*, // host ipaddr
                                                                                HashKey,str_hash,ltstr> *//*std::map<HashKey,char*>*/
        // host name -> address string; the address strings are free()d by the destructor
        typedef std::map<const char*,char*,ltstr> DNSCache;
        DNSCache cache;// set of resolved or resolving hosts
};

#endif

-------------------------------------------------------
This SF.net email is sponsored by: SF.net Giveback Program.
Does SourceForge.net help you be more productive? Does it
help you create better code? SHARE THE LOVE, and help us help
YOU! Click Here: http://sourceforge.net/donate/
Received on 2003-11-06