cURL / Mailing Lists / curl-library / Single Mail

curl-library

(no subject)

From: <crillion_at_tiscali.it>
Date: Wed, 22 Apr 2009 19:27:04 +0200 (CEST)

Hi,

    I'm using libcurl 7.19.4 with Visual C++ 2008 express edition
(9.0.30729.1 SP) on Windows Vista Business. I'm trying to get the html
of an online dictionary page, but:

- the page has some sort of redirect (I'm not able to block it from
  browsers)
- libcurl gets only a partial and broken html

In my code, I define a webpage() function that, given a URL, retrieves
the HTML code. I took the code directly from an example on the curl
website. Here it is:

        static void *myrealloc(void
*ptr, size_t size);
        static size_t WriteMemoryCallback(void *ptr,
size_t size, size_t nmemb, void *data);

        struct MemoryStruct {
                char*
memory;
                size_t size;
        };

        static void *myrealloc(void *ptr, size_t
size)
        {
                if(ptr) return realloc(ptr, size);
                else return malloc
(size);
        }

        static size_t WriteMemoryCallback(void *ptr, size_t size,
size_t nmemb, void *data)
        {
                size_t realsize = size * nmemb;
                struct
MemoryStruct *mem = (struct MemoryStruct*)data;
                
                mem->memory =
(char*)myrealloc(mem->memory, mem->size + realsize + 1);
                if(mem-
>memory) {
                        memcpy(&(mem->memory[mem->size]), ptr, realsize);
                        mem-
>size += realsize;
                        mem->memory[mem->size] = 0;
                }
                
                return
realsize;
        }

        string webpage(const string& url)
        {

                CURL
*curl_handle;
                
                struct MemoryStruct chunk;
                
                chunk.memory = NULL;

                chunk.size = 0;
                
                curl_global_init(CURL_GLOBAL_ALL);

                
curl_handle = curl_easy_init();
                
                curl_easy_setopt(curl_handle,
CURLOPT_URL, url.c_str());
                
                curl_easy_setopt(curl_handle,
CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
                
                curl_easy_setopt
(curl_handle, CURLOPT_WRITEDATA, (void*)&chunk);
                
                curl_easy_setopt
(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0");
                
                
curl_easy_perform(curl_handle);
                
                curl_easy_cleanup(curl_handle);
                

                string s;
                
                if(chunk.memory)
                {
                        s = string(chunk.memory);
                        
free(chunk.memory);
                }
                
                curl_global_cleanup();

                return s;
        }

Then I simply call it with the URL
"http://old.demauroparavia.it//@rozzo"

  string html = webpage("http://old.demauroparavia.
it//@rozzo");

but here's what it retrieves(I include it in an html
comment, hope this is enough to block html rendering):
<!--

</div>

<!-- fine contenuto -->

<!-- inizio menu di navigazione -->
<div id="
menu">

<h2>Per saperne di pi&ugrave;</h2>
<ol>
<li><a href="
avvertenze" title="consulta questa sezione per meglio comprendere le
voci del dizionario">Avvertenze per la consultazione </a></li>
<li><a
href="avanzata" title="consulta le indicazioni per effettuare ricerche
complesse">Ricerca avanzata </a></li><li><a href="abbreviazioni" title="
consulta l'elenco delle abbreviazioni presenti nel dizionario"
>Abbreviazioni </a></li><li><a href="quadri/" title="consulta la tavola
delle coniugazioni">Tavole delle coniugazioni dei verbi </a></li><li><a
href="lemmario" title="consulta il lemmario completo">Lemmario completo
</a></li><li><a href="consultati" title="consulta la statistica dei 500
lemmi pi&ugrave; consultati">I lemmi pi&ugrave; consultati </a></li>

</ol>

<h2>Collegamenti</h2>
<ol>
<li><a href="http://www.paravia.
it/vetrina_diz.php" title="Vai alla pagina dei dizionari cartacei sul
sito Paravia">I dizionari Paravia </a></li>
<li><a href="http:
//oxfordparavia.it" title="Vai al Dizionario italiano-inglese Oxford
Paravia Concise">Dizionario italiano-inglese<br />
Oxford Paravia
Concise </a></li>
<li><a href="http://dizionariodai.it" title="Vai al
Dizionario DAI Paravia">Dizionario di apprendimento<br />della lingua
inglese </a></li>
<li><a href="http://www.ldoceonline.com/" title="Vai
al Dizionario Inglese Longman">Dizionario inglese Longman </a></li>

<li><a href="http://www.mozillaitalia.org/searchplugins/" title="Vai al
sito ufficiale Mozilla per scaricare il Plugin De Mauro per FireFox">De
Mauro plugin per FireFox </a></li>
<!--<li><a href="http://www.apple.
com/downloads/dashboard/search/demauro.html" title="Vai al sito Apple e
scarica il Widget per Dashboard su Mac OS X">Dashboard Widget per Mac
</a></li>-->
</ol>

<h2>Scrivi alla redazione</h2>
<ol>
<li><a href="
email" title="vai alla pagina dalla quale puoi spedire una email alla
redazione">&Egrave; possibile contattare la redazione per chiarimenti o
per segnalare eventuali errori </a></li>
</ol>

<!-- Include the
Google Friend Connect javascript library. -->
<script type="
text/javascript" src="http://www.google.
com/friendconnect/script/friendconnect.js"></script>

<!-- Define the
div tag where the gadget will be inserted. -->
<div id="div-
1230808844540" style="width:196px;margin-left:-10px;border:1px solid
#cccccc;"></div>
<!-- Render the gadget into a div. -->
<script type="
text/javascript">
var skin = {};
skin['HEIGHT'] = '320';
skin
['BORDER_COLOR'] = '#cccccc';
skin['ENDCAP_BG_COLOR'] = '#e0ecff';
skin
['ENDCAP_TEXT_COLOR'] = '#333333';
skin['ENDCAP_LINK_COLOR'] =
'#0000cc';
skin['ALTERNATE_BG_COLOR'] = '#ffffff';
skin
['CONTENT_BG_COLOR'] = '#ffffff';
skin['CONTENT_LINK_COLOR'] =
'#0000cc';
skin['CONTENT_TEXT_COLOR'] = '#333333';
skin
['CONTENT_SECONDARY_LINK_COLOR'] = '#7777cc';
skin
['CONTENT_SECONDARY_TEXT_COLOR'] = '#666666';
skin
['CONTENT_HEADLINE_COLOR'] = '#333333';
google.friendconnect.container.
setParentUrl('/' /* location of rpc_relay.html and canvas.html */);

google.friendconnect.container.renderMembersGadget(
 { id: 'div-
1230808844540',
   site: '02173626207485754890'},
  skin);
</script>

</div>
<!-- fine menu di navigazione -->

<!-- inizio colonnino Google
-->
<div id="pubblicita">
<script type="text/javascript"><!--

google_ad_client = "pub-5705799158375171";
google_ad_width = 160;

google_ad_height = 600;
google_ad_format = "160x600_as";
google_ad_type
= "text_image";
google_ad_channel ="";
google_color_border = "E1F8F9";

google_color_bg = "E1F8F9";
google_color_link = "CC0000";

google_color_text = "000000";
google_color_url = "0066CC";
//--
></script>
<script type="text/javascript"
  src="http://pagead2.
googlesyndication.com/pagead/show_ads.js">
</script>
</div>
<!-- fine
colonnino Google -->

<!-- inizio piede di pagina -->
<div id="
piedipagina">
        <a href="http://www.paravia.it" title="vai al sito
Paravia" class="copy">PARAVIA</a> - <a href="http://www.ppbm.it" title="
vai al sito Pearson Paravia Bruno Mondadori" class="copy">Pearson
Paravia Bruno Mondadori</a> &copy; 1999-2007, tutti i diritti
riservati, P.I. 07415430011
</div>
<div id="certificazioni">
        <p>
        <a
href="http://validator.w3.org/check?uri=referer"><img style="border:0;
width:88px;height:31px" src="http://www.w3.org/Icons/valid-xhtml10"
alt="Valid XHTML 1.0 Strict" height="31" width="88" /></a>
        <a href="
http://jigsaw.w3.org/css-validator/validator?uri=http%3A%2F%2Fwww.
demauroparavia.it"><img style="border:0;width:88px;height:31px" src="
http://jigsaw.w3.org/css-validator/images/vcss" alt="Valid CSS!" /></a>

        <a href="http://www.w3.org/WAI/WCAG1AA-Conformance" title="Explanation
of Level Double-A Conformance"><img style="border:0;width:88px;height:
31px" height="32" width="88" src="http://www.w3.org/WAI/wcag1AA" alt="
Level Double-A conformance icon, W3C-WAI Web Content Accessibility
Guidelines 1.0"/></a>
        </p>
</div>
<!-- fine piede di pagina -->

<script type="text/JavaScript" >
<!--
document.getElementById
('textfield').focus();
document.getElementById('textfield').select();

--
>
</script>
</body></html>
--->
This block of HTML is truncated. I cannot figure out what happens in the
browser (Firefox 3.0.9 - it automatically goes to the other page); I
cannot block this redirect even if I turn off automatic META redirects in
the Advanced tab of the Options window, nor if I turn off JavaScript.
I'm interested in getting the correct HTML of the page, or a way to get
the next, encoded URL, http://old.demauroparavia.it/99573 , which I can
read correctly.
   
Many thanks,
               Marco 
Su Tiscali Photo c'è la Promozione di Primavera. Stampa le tue foto nei formati 13x17 o 13x19 a soli 0,11 euro
http://photo.tiscali.it
Received on 2009-04-22