curl / libcurl / API / Examples / htmltitle.cpp

htmltitle.cpp

/***************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 * Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
 *
 * This software is licensed as described in the file COPYING, which
 * you should have received as part of this distribution. The terms
 * are also available at https://curl.se/docs/copyright.html.
 *
 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
 * copies of the Software, and permit persons to whom the Software is
 * furnished to do so, under the terms of the COPYING file.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 * SPDX-License-Identifier: curl
 *
 ***************************************************************************/
/* <DESC>
 * Get a webpage, extract the title with libxml.
 * </DESC>
 
  Written by Lars Nilsson
 
  GNU C++ compile command line suggestion (edit paths accordingly):
 
  g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \ 
    -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
*/
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
 
#include <curl/curl.h>
 
#include <libxml/HTMLparser.h>
 
//
// Case-insensitive string comparison
//
// COMPARE(a, b) evaluates to true when the two NUL-terminated strings are
// equal ignoring letter case, and to false otherwise.
//

#ifdef _WIN32
#define COMPARE(a, b) (_stricmp((a), (b)) == 0)
#else
// POSIX declares strcasecmp() in <strings.h>; <string.h> only provides it
// as an extension on some C libraries.
#include <strings.h>
#define COMPARE(a, b) (strcasecmp((a), (b)) == 0)
#endif
 
//
// libxml callback context structure
//
// One instance is handed to the parser as user data and is then passed
// back, as a void pointer, to each SAX callback in this file.
//
struct Context {
  // Start outside of any title element, with no text collected yet.
  Context() : addTitle(false), title() {}

  // Set by StartElement on a TITLE element and cleared by EndElement;
  // handleCharacters only appends text while it is set.
  bool addTitle;

  // Text collected so far for the most recently opened TITLE element.
  std::string title;
};
 
//
// libcurl variables for error strings and returned data
//
// errorBuffer is registered with CURLOPT_ERRORBUFFER in init() and is
// printed in the failure messages of init() and main().
// buffer is registered with CURLOPT_WRITEDATA in init(), so it is the
// string that writer() appends to; main() passes it to parseHtml().
//

static char errorBuffer[CURL_ERROR_SIZE];
static std::string buffer;
 
//
// libcurl write callback function
//
// Appends the size * nmemb bytes starting at 'data' to the string that
// 'writerData' points to.
//
// Returns the number of bytes appended, or 0 when 'writerData' is NULL.
//
static size_t writer(char *data, size_t size, size_t nmemb,
                     std::string *writerData)
{
  const size_t total = size * nmemb;

  if(!writerData)
    return 0;

  writerData->append(data, total);
  return total;
}
 
//
// Release a partially configured handle after a failed init() step.
// Resets 'curl' to NULL and always returns false, so a failure path can
// simply 'return discardHandle(curl);'.
//
static bool discardHandle(CURL *&curl)
{
  curl_easy_cleanup(curl);
  curl = NULL;
  return false;
}

//
// libcurl connection initialization
//
// Creates an easy handle, stores it in 'curl' and configures it to fetch
// 'url': errors are reported through errorBuffer, redirects are followed,
// and received data is passed to writer() with the global 'buffer' as its
// destination.
//
// Returns true on success. On failure a message is printed to stderr, any
// handle that was created is released, 'curl' is left NULL and false is
// returned, so the caller has no handle to clean up.
//
static bool init(CURL *&curl, const char *url)
{
  CURLcode result;

  curl = curl_easy_init();

  if(!curl) {
    fprintf(stderr, "Failed to create CURL handle\n");
    return false;
  }

  result = curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, errorBuffer);
  if(result != CURLE_OK) {
    // The error buffer is not registered at this point, so report the
    // numeric code instead of its contents.
    fprintf(stderr, "Failed to set error buffer [%d]\n",
            static_cast<int>(result));
    return discardHandle(curl);
  }

  result = curl_easy_setopt(curl, CURLOPT_URL, url);
  if(result != CURLE_OK) {
    fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
    return discardHandle(curl);
  }

  result = curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
  if(result != CURLE_OK) {
    fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
    return discardHandle(curl);
  }

  result = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writer);
  if(result != CURLE_OK) {
    fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
    return discardHandle(curl);
  }

  result = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);
  if(result != CURLE_OK) {
    fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
    return discardHandle(curl);
  }

  return true;
}
 
//
// libxml start element callback function
//
static void StartElement(void *voidContext,
                         const xmlChar *name,
                         const xmlChar **attributes)
{
  Context *context = static_cast<Context *>(voidContext);
 
  if(COMPARE(reinterpret_cast<const char *>(name), "TITLE")) {
    context->title = "";
    context->addTitle = true;
  }
  (void)attributes;
}
 
//
// libxml end element callback function
//
static void EndElement(void *voidContext,
                       const xmlChar *name)
{
  Context *context = static_cast<Context *>(voidContext);
 
  if(COMPARE(reinterpret_cast<const char *>(name), "TITLE"))
    context->addTitle = false;
}
 
//
// Text handling helper function
//
// Appends 'length' bytes starting at 'chars' to the collected title, but
// only while text collection is switched on; otherwise does nothing.
//
static void handleCharacters(Context *context,
                             const xmlChar *chars,
                             int length)
{
  if(!context->addTitle)
    return;

  const char *text = reinterpret_cast<const char *>(chars);
  context->title.append(text, static_cast<unsigned long>(length));
}
 
//
// libxml PCDATA callback function
//
// Forwards the received text to handleCharacters().
//
static void Characters(void *voidContext,
                       const xmlChar *chars,
                       int length)
{
  handleCharacters(static_cast<Context *>(voidContext), chars, length);
}
 
//
// libxml CDATA callback function
//
// Forwards the received text to handleCharacters(), exactly as the PCDATA
// callback does.
//
static void cdata(void *voidContext,
                  const xmlChar *chars,
                  int length)
{
  handleCharacters(static_cast<Context *>(voidContext), chars, length);
}
 
//
// libxml SAX callback structure
//
// This is a positional initializer: each entry fills the next member of
// htmlSAXHandler in declaration order, so the position of every value
// matters. Only the four callbacks defined in this file are installed;
// every other entry is left null/zero.
//
// NOTE(review): the slot positions are taken as-is from this file. Confirm
// them against the htmlSAXHandler declaration of the libxml2 version in
// use before adding, removing or moving an entry.
//
static htmlSAXHandler saxHandler = {
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  StartElement,  // start element callback
  EndElement,    // end element callback
  NULL,
  Characters,    // PCDATA callback
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  cdata,         // CDATA callback
  NULL,
  0,
  0,
  0,
  0,
  NULL
};
 
//
// Parse given (assumed to be) HTML text and return the title
//
static void parseHtml(const std::string &html,
                      std::string &title)
{
  htmlParserCtxtPtr ctxt;
  Context context;
 
  ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
                                  XML_CHAR_ENCODING_NONE);
 
  htmlParseChunk(ctxt, html.c_str(), (int)html.size(), 0);
  htmlParseChunk(ctxt, "", 0, 1);
 
  htmlFreeParserCtxt(ctxt);
 
  title = context.title;
}
 
int main(int argc, const char *argv[])
{
  CURL *curl = NULL;
  CURLcode result;
  std::string title;
 
  // Ensure one argument is given

  if(argc != 2) {
    fprintf(stderr, "Usage: %s <url>\n", argv[0]);
    return EXIT_FAILURE;
  }
 
  result = curl_global_init(CURL_GLOBAL_ALL);
  if(result != CURLE_OK)
    return (int)result;
 
  // Initialize CURL handle

  if(!init(curl, argv[1])) {
    fprintf(stderr, "Handle initialization failed\n");
    curl_global_cleanup();
    return EXIT_FAILURE;
  }
 
  // Retrieve content for the URL

  result = curl_easy_perform(curl);
  curl_easy_cleanup(curl);
 
  if(result != CURLE_OK) {
    fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
    return EXIT_FAILURE;
  }
 
  // Parse the (assumed) HTML code
  parseHtml(buffer, title);
 
  // Display the extracted title
  printf("Title: %s\n", title.c_str());
 
  return (int)result;
}

Notice

This source code example is simplified and may ignore return codes and error checks. We do this to highlight the libcurl function calls and related options and reduce unrelated code.

A real-world application should, of course, check every return value and exit cleanly at the first serious error.