htmltitle.cpp
/*************************************************************************** * _ _ ____ _ * Project ___| | | | _ \| | * / __| | | | |_) | | * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * * Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. The terms * are also available at https://curl.se/docs/copyright.html. * * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is * furnished to do so, under the terms of the COPYING file. * * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY * KIND, either express or implied. * * SPDX-License-Identifier: curl * ***************************************************************************/ /* <DESC> * Get a webpage, extract the title with libxml. * </DESC> Written by Lars Nilsson GNU C++ compile command line suggestion (edit paths accordingly): g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \ -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2 */ #include <stdio.h> #include <string.h> #include <stdlib.h> #include <string> #include <curl/curl.h> #include <libxml/HTMLparser.h> // // Case-insensitive string comparison // #ifdef _WIN32 #define COMPARE(a, b) (!_stricmp(a, b)) #else #define COMPARE(a, b) (!strcasecmp(a, b)) #endif // // libxml callback context structure // struct Context { Context() : addTitle(false) {} bool addTitle; std::string title; }; // // libcurl variables for error strings and returned data static char errorBuffer[CURL_ERROR_SIZE]; static std::string buffer; // // libcurl write callback function // static size_t writer(char *data, size_t size, size_t nmemb, std::string *writerData) { if(writerData == NULL) return 0; writerData->append(data, size * nmemb); return size * nmemb; } // // libcurl connection initialization // static bool 
init(CURL *&curl, const char *url) { CURLcode result; curl = curl_easy_init(); if(!curl) { fprintf(stderr, "Failed to create CURL handle\n"); return false; } result = curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, errorBuffer); if(result != CURLE_OK) { fprintf(stderr, "Failed to set error buffer [%d]\n", result); return false; } result = curl_easy_setopt(curl, CURLOPT_URL, url); if(result != CURLE_OK) { fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer); return false; } result = curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); if(result != CURLE_OK) { fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer); return false; } result = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writer); if(result != CURLE_OK) { fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer); return false; } result = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer); if(result != CURLE_OK) { fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer); return false; } return true; } // // libxml start element callback function // static void StartElement(void *voidContext, const xmlChar *name, const xmlChar **attributes) { Context *context = static_cast<Context *>(voidContext); if(COMPARE(reinterpret_cast<const char *>(name), "TITLE")) { context->title = ""; context->addTitle = true; } (void)attributes; } // // libxml end element callback function // static void EndElement(void *voidContext, const xmlChar *name) { Context *context = static_cast<Context *>(voidContext); if(COMPARE(reinterpret_cast<const char *>(name), "TITLE")) context->addTitle = false; } // // Text handling helper function // static void handleCharacters(Context *context, const xmlChar *chars, int length) { if(context->addTitle) context->title.append(reinterpret_cast<const char *>(chars), (unsigned long)length); } // // libxml PCDATA callback function // static void Characters(void *voidContext, const xmlChar *chars, int length) { Context *context = static_cast<Context *>(voidContext); 
handleCharacters(context, chars, length); } // // libxml CDATA callback function // static void cdata(void *voidContext, const xmlChar *chars, int length) { Context *context = static_cast<Context *>(voidContext); handleCharacters(context, chars, length); } // // libxml SAX callback structure // static htmlSAXHandler saxHandler = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, StartElement, EndElement, NULL, Characters, NULL, NULL, NULL, NULL, NULL, NULL, NULL, cdata, NULL, 0, 0, 0, 0, NULL }; // // Parse given (assumed to be) HTML text and return the title // static void parseHtml(const std::string &html, std::string &title) { htmlParserCtxtPtr ctxt; Context context; ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "", XML_CHAR_ENCODING_NONE); htmlParseChunk(ctxt, html.c_str(), (int)html.size(), 0); htmlParseChunk(ctxt, "", 0, 1); htmlFreeParserCtxt(ctxt); title = context.title; } int main(int argc, const char *argv[]) { CURL *curl = NULL; CURLcode result; std::string title; // Ensure one argument is given if(argc != 2) { fprintf(stderr, "Usage: %s <url>\n", argv[0]); return EXIT_FAILURE; } result = curl_global_init(CURL_GLOBAL_ALL); if(result != CURLE_OK) return (int)result; // Initialize CURL handle if(!init(curl, argv[1])) { fprintf(stderr, "Handle initialization failed\n"); curl_global_cleanup(); return EXIT_FAILURE; } // Retrieve content for the URL result = curl_easy_perform(curl); curl_easy_cleanup(curl); if(result != CURLE_OK) { fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer); return EXIT_FAILURE; } // Parse the (assumed) HTML code parseHtml(buffer, title); // Display the extracted title printf("Title: %s\n", title.c_str()); return (int)result; }
Notice
This source code example is simplified and may ignore return
codes and error checks. We do this to highlight the libcurl function calls and
related options and reduce unrelated code.
A real-world application would, of course, check every return value and exit cleanly at the first serious error.