blob: da3354a55363ff1bb571f82f4e99b26a841736a5 [file] [log] [blame]
/*****************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 */
9
// Get a web page, parse it with libxml.
//
// Written by Lars Nilsson
//
// GNU C++ compile command line suggestion (edit paths accordingly):
//
// g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cc \
// -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
18
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <string>
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
25
//
// Case-insensitive string comparison
//
// COMPARE(a, b) is true when the two NUL-terminated strings are equal
// ignoring case. MSVC names the function _stricmp (plain stricmp is a
// deprecated alias); elsewhere strcasecmp is used, which POSIX declares in
// <strings.h> rather than <string.h>, so that header is pulled in here.
//

#ifdef _MSC_VER
#define COMPARE(a, b) (!_stricmp((a), (b)))
#else
#include <strings.h>
#define COMPARE(a, b) (!strcasecmp((a), (b)))
#endif
35
//
// libxml callback context structure
//
// State shared by the SAX callbacks: addTitle is set while text should be
// collected, and title accumulates that text. Both start out empty/false.
//

struct Context
{
  bool addTitle;
  std::string title;

  Context()
    : addTitle(false),
      title()
  {
  }
};
47
48//
49// libcurl variables for error strings and returned data
50
51static char errorBuffer[CURL_ERROR_SIZE];
52static std::string buffer;
53
//
// libcurl write callback function
//
// Appends the size * nmemb bytes starting at data to *writerData and
// returns the number of bytes taken. Returns 0 without touching anything
// when writerData is NULL.
//
// The return type is size_t, as the CURLOPT_WRITEFUNCTION callback contract
// requires; an int return would truncate the byte count and mismatch the
// function type libcurl calls through.
//

static size_t writer(char *data, size_t size, size_t nmemb,
                     std::string *writerData)
{
  if (writerData == NULL)
    return 0;

  const size_t bytes = size * nmemb;

  writerData->append(data, bytes);

  return bytes;
}
68
69//
70// libcurl connection initialization
71//
72
73static bool init(CURL *&conn, char *url)
74{
75 CURLcode code;
76
77 conn = curl_easy_init();
78
79 if (conn == NULL)
80 {
81 fprintf(stderr, "Failed to create CURL connection\n");
82
83 exit(EXIT_FAILURE);
84 }
85
86 code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
87 if (code != CURLE_OK)
88 {
89 fprintf(stderr, "Failed to set error buffer [%d]\n", code);
90
91 return false;
92 }
93
94 code = curl_easy_setopt(conn, CURLOPT_URL, url);
95 if (code != CURLE_OK)
96 {
97 fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
98
99 return false;
100 }
101
102 code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
103 if (code != CURLE_OK)
104 {
105 fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
106
107 return false;
108 }
109
110 code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
111 if (code != CURLE_OK)
112 {
113 fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
114
115 return false;
116 }
117
118 code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
119 if (code != CURLE_OK)
120 {
121 fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
122
123 return false;
124 }
125
126 return true;
127}
128
129//
130// libxml start element callback function
131//
132
133static void StartElement(void *voidContext,
134 const xmlChar *name,
135 const xmlChar **attributes)
136{
137 Context *context = (Context *)voidContext;
138
139 if (COMPARE((char *)name, "TITLE"))
140 {
141 context->title = "";
142 context->addTitle = true;
143 }
144 (void) attributes;
145}
146
147//
148// libxml end element callback function
149//
150
151static void EndElement(void *voidContext,
152 const xmlChar *name)
153{
154 Context *context = (Context *)voidContext;
155
156 if (COMPARE((char *)name, "TITLE"))
157 context->addTitle = false;
158}
159
160//
161// Text handling helper function
162//
163
164static void handleCharacters(Context *context,
165 const xmlChar *chars,
166 int length)
167{
168 if (context->addTitle)
169 context->title.append((char *)chars, length);
170}
171
172//
173// libxml PCDATA callback function
174//
175
176static void Characters(void *voidContext,
177 const xmlChar *chars,
178 int length)
179{
180 Context *context = (Context *)voidContext;
181
182 handleCharacters(context, chars, length);
183}
184
185//
186// libxml CDATA callback function
187//
188
189static void cdata(void *voidContext,
190 const xmlChar *chars,
191 int length)
192{
193 Context *context = (Context *)voidContext;
194
195 handleCharacters(context, chars, length);
196}
197
198//
199// libxml SAX callback structure
200//
201
202static htmlSAXHandler saxHandler =
203{
204 NULL,
205 NULL,
206 NULL,
207 NULL,
208 NULL,
209 NULL,
210 NULL,
211 NULL,
212 NULL,
213 NULL,
214 NULL,
215 NULL,
216 NULL,
217 NULL,
218 StartElement,
219 EndElement,
220 NULL,
221 Characters,
222 NULL,
223 NULL,
224 NULL,
225 NULL,
226 NULL,
227 NULL,
228 NULL,
229 cdata,
230 NULL
231};
232
233//
234// Parse given (assumed to be) HTML text and return the title
235//
236
237static void parseHtml(const std::string &html,
238 std::string &title)
239{
240 htmlParserCtxtPtr ctxt;
241 Context context;
242
243 ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
244 XML_CHAR_ENCODING_NONE);
245
246 htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
247 htmlParseChunk(ctxt, "", 0, 1);
248
249 htmlFreeParserCtxt(ctxt);
250
251 title = context.title;
252}
253
254int main(int argc, char *argv[])
255{
256 CURL *conn = NULL;
257 CURLcode code;
258 std::string title;
259
260 // Ensure one argument is given
261
262 if (argc != 2)
263 {
264 fprintf(stderr, "Usage: %s <url>\n", argv[0]);
265
266 exit(EXIT_FAILURE);
267 }
268
269 curl_global_init(CURL_GLOBAL_DEFAULT);
270
271 // Initialize CURL connection
272
273 if (!init(conn, argv[1]))
274 {
275 fprintf(stderr, "Connection initializion failed\n");
276
277 exit(EXIT_FAILURE);
278 }
279
280 // Retrieve content for the URL
281
282 code = curl_easy_perform(conn);
283 curl_easy_cleanup(conn);
284
285 if (code != CURLE_OK)
286 {
287 fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
288
289 exit(EXIT_FAILURE);
290 }
291
292 // Parse the (assumed) HTML code
293
294 parseHtml(buffer, title);
295
296 // Display the extracted title
297
298 printf("Title: %s\n", title.c_str());
299
300 return EXIT_SUCCESS;
301}