Blame - doc/tutorial/xmltutorial.xml - platform/external/libxml2

blob: bfd8c1da32541b995d9ed569791100dc6dff15ad [file] [log] [blame]

MDT 2002 John Fleck	598f6eb	2002-06-04 15:10:36 +0000	[diff] [blame]	1	<?xml version="1.0"?>
				2	<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
				3	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" [
				4	<!ENTITY KEYWORD SYSTEM "includekeyword.c">
				5	<!ENTITY STORY SYSTEM "includestory.xml">
				6	<!ENTITY ADDKEYWORD SYSTEM "includeaddkeyword.c">
				7	<!ENTITY ADDATTRIBUTE SYSTEM "includeaddattribute.c">
MDT 2002 John Fleck	5452083	2002-06-13 03:30:26 +0000	[diff] [blame]	8	<!ENTITY GETATTRIBUTE SYSTEM "includegetattribute.c">
MDT 2002 John Fleck	598f6eb	2002-06-04 15:10:36 +0000	[diff] [blame]	9	]>
				10	<article>
				11	<articleinfo>
				12	<title>Libxml Tutorial</title>
				13	<author>
				14	<firstname>John</firstname>
				15	<surname>Fleck</surname>
				16	</author>
				17	<copyright>
				18	<year>2002</year>
				19	<holder>John Fleck</holder>
				20	</copyright>
				21	<revhistory>
				22	<revision>
				23	<revnumber>1</revnumber>
				24	<date>June 4, 2002</date>
				25	</revision>
MDT 2002 John Fleck	5452083	2002-06-13 03:30:26 +0000	[diff] [blame]	26	<revision>
				27	<revnumber>2</revnumber>
				28	<date>June 12, 2002</date>
				29	</revision>
MDT 2002 John Fleck	77e4d35	2002-09-01 01:37:11 +0000	[diff] [blame]	30	<revision>
				31	<revnumber>3</revnumber>
				32	<date>Aug. 31, 2002</date>
				33	</revision>
MDT 2002 John Fleck	598f6eb	2002-06-04 15:10:36 +0000	[diff] [blame]	34	</revhistory>
				35	</articleinfo>
				36	<abstract>
				37	<para>Libxml is a freely licensed C language library for handling
				38	<acronym>XML</acronym>, portable across a large number of platforms. This
				39	tutorial provides examples of its basic functions.</para>
				40	</abstract>
				41	<sect1 id="introduction">
				42	<title>Introduction</title>
				43	<para>Libxml is a C language library implementing functions for reading,
				44	creating and manipulating <acronym>XML</acronym> data. This tutorial
				45	provides example code and explanations of its basic functionality.</para>
				46	<para>Libxml and more details about its use are available on <ulink
				47	url="http://www.xmlsoft.org/">the project home page</ulink>. Included there is complete <ulink url="http://xmlsoft.org/html/libxml-lib.html">
				48	<acronym>API</acronym> documentation</ulink>. This tutorial is not meant
				49	to substitute for that complete documentation, but to illustrate the
				50	functions needed to use the library to perform basic operations.
				51	<!--
				52	Links to
				53	other resources can be found in <xref linkend="furtherresources" />.
				54	-->
				55	</para>
				56	<para>The tutorial is based on a simple <acronym>XML</acronym> application I
				57	use for articles I write. The format includes metadata and the body
				58	of the article.</para>
				59	<para>The example code in this tutorial demonstrates how to:
				60	<itemizedlist>
				61	<listitem>
				62	<para>Parse the document.</para>
				63	</listitem>
				64	<listitem>
				65	<para>Extract the text within a specified element.</para>
				66	</listitem>
				67	<listitem>
				68	<para>Add an element and its content.</para>
				69	</listitem>
				70	<listitem>
MDT 2002 John Fleck	5452083	2002-06-13 03:30:26 +0000	[diff] [blame]	71	<para>Add an attribute.</para>
				72	</listitem>
				73	<listitem>
MDT 2002 John Fleck	598f6eb	2002-06-04 15:10:36 +0000	[diff] [blame]	74	<para>Extract the value of an attribute.</para>
				75	</listitem>
MDT 2002 John Fleck	598f6eb	2002-06-04 15:10:36 +0000	[diff] [blame]	76	</itemizedlist>
MDT 2002 John Fleck	598f6eb	2002-06-04 15:10:36 +0000	[diff] [blame]	77	</para>
				78	<para>Full code for the examples is included in the appendices.</para>
				79
				80	</sect1>
				81
				82	<sect1 id="xmltutorialdatatypes">
				83	<title>Data Types</title>
				84	<para><application>Libxml</application> declares a number of datatypes we
				85	will encounter repeatedly, hiding the messy stuff so you do not have to deal
				86	with it unless you have some specific need.</para>
				87	<para>
				88	<variablelist>
				89	<varlistentry>
				90	<term><ulink
				91	url="http://xmlsoft.org/html/libxml-tree.html#XMLCHAR">xmlChar</ulink></term>
				92	<listitem>
				93	<para>A basic replacement for char, a byte in a UTF-8 encoded
				94	string.</para>
				95	</listitem>
				96	</varlistentry>
				97	<varlistentry>
				98	<term>
				99	<ulink url="http://xmlsoft.org/html/libxml-tree.html#XMLDOC">xmlDoc</ulink></term>
				100	<listitem>
				101	<para>A structure containing the tree created by a parsed doc. <ulink
				102	url="http://xmlsoft.org/html/libxml-tree.html#XMLDOCPTR">xmlDocPtr</ulink>
				103	is a pointer to the structure.</para>
				104	</listitem>
				105	</varlistentry>
				106	<varlistentry>
				107	<term><ulink
				108	url="http://xmlsoft.org/html/libxml-tree.html#XMLNODEPTR">xmlNodePtr</ulink>
				109	and <ulink url="http://xmlsoft.org/html/libxml-tree.html#XMLNODE">xmlNode</ulink></term>
				110	<listitem>
				111	<para>A structure containing a single node. <ulink
				112	url="http://xmlsoft.org/html/libxml-tree.html#XMLNODEPTR">xmlNodePtr</ulink>
				113	is a pointer to the structure, and is used in traversing the document tree.</para>
				114	</listitem>
				115	</varlistentry>
				116	</variablelist>
				117	</para>
				118
				119	</sect1>
				120
				121	<sect1 id="xmltutorialparsing">
				122	<title>Parsing the file</title>
				123	<para>Parsing the file requires only the name of the file and a single
				124	function call, plus error checking. Full code: <xref
				125	linkend="keywordappendix" /></para>
				126	<para>
				127	<programlisting>
				128	<co id="declaredoc" /> xmlDocPtr doc;
				129	<co id="declarenode" /> xmlNodePtr cur;
				130
				131	<co id="parsefile" /> doc = xmlParseFile(docname);
				132
				133	<co id="checkparseerror" /> if (doc == NULL ) {
				134	fprintf(stderr,"Document not parsed successfully. \n");
John Fleck	be98b33	2002-09-04 03:16:23 +0000	[diff] [blame^]	135	xmlFreeDoc(doc);
MDT 2002 John Fleck	598f6eb	2002-06-04 15:10:36 +0000	[diff] [blame]	136	return;
				137	}
				138
				139	<co id="getrootelement" /> cur = xmlDocGetRootElement(doc);
				140
				141	<co id="checkemptyerror" /> if (cur == NULL) {
				142	fprintf(stderr,"empty document\n");
				143	xmlFreeDoc(doc);
				144	return;
				145	}
				146
				147	<co id="checkroottype" /> if (xmlStrcmp(cur->name, (const xmlChar *) "story")) {
				148	fprintf(stderr,"document of the wrong type, root node != story");
				149	xmlFreeDoc(doc);
				150	return;
				151	}
				152
				153	</programlisting>
				154	<calloutlist>
				155	<callout arearefs="declaredoc">
				156	<para>Declare the pointer that will point to your parsed document.</para>
				157	</callout>
				158	<callout arearefs="declarenode">
				159	<para>Declare a node pointer (you'll need this in order to
				160	interact with individual nodes).</para>
				161	</callout>
				162	<callout arearefs="checkparseerror">
				163	<para>Check to see that the document was successfully parsed.</para>
				164	</callout>
				165	<callout arearefs="getrootelement">
				166	<para>Retrieve the document's root element.</para>
				167	</callout>
				168	<callout arearefs="checkemptyerror">
				169	<para>Check to make sure the document actually contains something.</para>
				170	</callout>
				171	<callout arearefs="checkroottype">
				172	<para>In our case, we need to make sure the document is the right
				173	type. "story" is the root type of my documents.</para>
				174	</callout>
				175	</calloutlist>
				176	</para>
				177	</sect1>
				178
				179	<sect1 id="xmltutorialgettext">
				180	<title>Retrieving Element Content</title>
				181	<para>Retrieving the content of an element involves traversing the document
				182	tree until you find what you are looking for. In this case, we are looking
				183	for an element called "keyword" contained within an element called "story". The
				184	process to find the node we are interested in involves tediously walking the
				185	tree. We assume you already have an xmlDocPtr called <varname>doc</varname>
				186	and an xmlNodePtr called <varname>cur</varname>.</para>
				187
				188	<para>
				189	<programlisting>
				190	<co id="getchildnode" /> cur = cur->xmlChildrenNode;
				191	<co id="huntstoryinfo" /> while (cur != NULL) {
				192	if ((!xmlStrcmp(cur->name, (const xmlChar *)"storyinfo"))){
				193	parseStory (doc, cur);
				194	}
				195
				196	cur = cur->next;
				197	}
				198
				199	</programlisting>
				200
				201	<calloutlist>
				202	<callout arearefs="getchildnode">
				203	<para>Get the first child node of <varname>cur</varname>. At this
				204	point, <varname>cur</varname> points at the document root, which is
				205	the element "story".</para>
				206	</callout>
				207	<callout arearefs="huntstoryinfo">
				208	<para>This loop iterates through the elements that are children of
				209	"story", looking for one called "storyinfo". That
				210	is the element that will contain the "keywords" we are
				211	looking for. It uses the <application>libxml</application> string
				212	comparison
				213	function, <function><ulink
				214	url="http://xmlsoft.org/html/libxml-parser.html#XMLSTRCMP">xmlStrcmp</ulink></function>. If there is a match, it calls the function <function>parseStory</function>.</para>
				215	</callout>
				216	</calloutlist>
				217	</para>
				218
				219	<para>
				220	<programlisting>
				221	void
				222	parseStory (xmlDocPtr doc, xmlNodePtr cur) {
				223
				224	<co id="anothergetchild" /> cur = cur->xmlChildrenNode;
				225	<co id="findkeyword" /> while (cur != NULL) {
				226	if ((!xmlStrcmp(cur->name, (const xmlChar *)"keyword"))) {
				227	<co id="foundkeyword" /> printf("keyword: %s\n", xmlNodeListGetString(doc, cur->xmlChildrenNode, 1));
				228	}
				229	cur = cur->next;
				230	}
				231	return;
				232	}
				233	</programlisting>
				234	<calloutlist>
				235	<callout arearefs="anothergetchild">
				236	<para>Again we get the first child node.</para>
				237	</callout>
				238	<callout arearefs="findkeyword">
				239	<para>Like the loop above, we then iterate through the nodes, looking
				240	for one that matches the element we're interested in, in this case
				241	"keyword".</para>
				242	</callout>
				243	<callout arearefs="foundkeyword">
				244	<para>When we find the "keyword" element, we need to print
				245	its contents. Remember that in <acronym>XML</acronym>, the text
				246	contained within an element is a child node of that element, so we
				247	turn to <varname>cur->xmlChildrenNode</varname>. To retrieve it, we
				248	use the function <function><ulink
				249	url="http://xmlsoft.org/html/libxml-tree.html#XMLNODELISTGETSTRING">xmlNodeListGetString</ulink></function>, which also takes the <varname>doc</varname> pointer as an argument. In this case, we just print it out.</para>
				250	</callout>
				251	</calloutlist>
				252	</para>
				253
				254	</sect1>
				255
				256	<sect1 id="xmltutorialwritingcontent">
				257	<title>Writing element content</title>
				258	<para>Writing element content uses many of the same steps we used above
				259	— parsing the document and walking the tree. We parse the document,
				260	then traverse the tree to find the place we want to insert our element. For
				261	this example, we want to again find the "storyinfo" element and
				262	this time insert a keyword. Then we'll write the file to disk. Full code:
				263	<xref linkend="addkeywordappendix" /></para>
				264
				265	<para>
				266	The main difference in this example is in
				267	<function>parseStory</function>:
				268
				269	<programlisting>
				270	void
				271	parseStory (xmlDocPtr doc, xmlNodePtr cur, char *keyword) {
				272
				273	<co id="addkeyword" /> xmlNewTextChild (cur, NULL, "keyword", keyword);
				274	return;
				275	}
				276	</programlisting>
				277	<calloutlist>
				278	<callout arearefs="addkeyword">
				279	<para>The <function><ulink
				280	url="http://xmlsoft.org/html/libxml-tree.html#XMLNEWTEXTCHILD">xmlNewTextChild</ulink></function>
				281	function adds a new child element at the
				282	current node pointer's location in the
				283	tree, specified by <varname>cur</varname>.</para>
				284	</callout>
				285	</calloutlist>
				286	</para>
				287
				288	<para>
				289	Once the node has been added, we would like to write the document to
				290	file. If you want the element to have a namespace, you can add it here as
				291	well. In our case, the namespace is NULL.
				292	<programlisting>
				293	xmlSaveFormatFile (docname, doc, 1);
				294	</programlisting>
				295	The first parameter is the name of the file to be written. You'll notice
				296	it is the same as the file we just read. In this case, we just write over
				297	the old file. The second parameter is a pointer to the xmlDoc
				298	structure. Setting the third parameter equal to one ensures indenting on output.
				299	</para>
MDT 2002 John Fleck	598f6eb	2002-06-04 15:10:36 +0000	[diff] [blame]	300	</sect1>
MDT 2002 John Fleck	598f6eb	2002-06-04 15:10:36 +0000	[diff] [blame]	301
				302	<sect1 id="xmltutorialwritingattribute">
				303	<title>Writing Attribute</title>
				304	<para>Writing an attribute is similar to writing text to a new element. In
				305	this case, we'll add a reference <acronym>URI</acronym> to our
				306	document. Full code: <xref linkend="addattributeappendix" />.</para>
				307	<para>
				308	A <sgmltag>reference</sgmltag> is a child of the <sgmltag>story</sgmltag>
				309	element, so finding the place to put our new element and attribute is
				310	simple. As soon as we do the error-checking test in our
				311	<function>parseDoc</function>, we are in the right spot to add our
				312	element. But before we do that, we need to make a declaration using a
				313	datatype we have not seen yet:
				314	<programlisting>
				315	xmlAttrPtr newattr;
				316	</programlisting>
				317	We also need an extra xmlNodePtr:
				318	<programlisting>
				319	xmlNodePtr newnode;
				320	</programlisting>
				321	</para>
				322	<para>
				323	The rest of <function>parseDoc</function> is the same as before until we
				324	check to see if our root element is <sgmltag>story</sgmltag>. If it is,
				325	then we know we are at the right spot to add our element:
				326
				327	<programlisting>
				328	<co id="addreferencenode" /> newnode = xmlNewTextChild (cur, NULL, "reference", NULL);
				329	<co id="addattributenode" /> newattr = xmlNewProp (newnode, "uri", uri);
				330	</programlisting>
				331	<calloutlist>
				332	<callout arearefs="addreferencenode">
				333	<para>First we add a new node at the location of the current node
				334	pointer, <varname>cur</varname>, using the <ulink
				335	url="http://xmlsoft.org/html/libxml-tree.html#XMLNEWTEXTCHILD">xmlNewTextChild</ulink> function.</para>
				336	</callout>
				337	</calloutlist>
				338	</para>
				339
				340	<para>Once the node is added, the file is written to disk just as in the
				341	previous example in which we added an element with text content.</para>
				342
				343	</sect1>
				344
MDT 2002 John Fleck	5452083	2002-06-13 03:30:26 +0000	[diff] [blame]	345	<sect1 id="xmltutorialattribute">
				346	<title>Retrieving Attributes</title>
				347	<para>Retrieving the value of an attribute is similar to the previous
				348	example in which we retrieved a node's text contents. In this case we'll
				349	extract the value of the <acronym>URI</acronym> we added in the previous
				350	section. Full code: <xref linkend="getattributeappendix" />.</para>
				351	<para>
				352	The initial steps for this example are similar to the previous ones: parse
				353	the doc, find the element you are interested in, then enter a function to
				354	carry out the specific task required. In this case, we call
				355	<function>getReference</function>:
				356	<programlisting>
				357	void
				358	getReference (xmlDocPtr doc, xmlNodePtr cur) {
				359
				360	cur = cur->xmlChildrenNode;
				361	while (cur != NULL) {
				362	if ((!xmlStrcmp(cur->name, (const xmlChar *)"reference"))) {
				363	<co id="getattributevalue" /> printf("uri: %s\n", xmlGetProp(cur, "uri"));
				364	}
				365	cur = cur->next;
				366	}
				367	return;
				368	}
				369	</programlisting>
				370
				371	<calloutlist>
				372	<callout arearefs="getattributevalue">
				373	<para>
				374	The key function is <function><ulink
				375	url="http://xmlsoft.org/html/libxml-tree.html#XMLGETPROP">xmlGetProp</ulink></function>, which returns an
				376	<varname>xmlChar</varname> containing the attribute's value. In this case,
				377	we just print it out.
				378	<note>
				379	<para>
				380	If you are using a <acronym>DTD</acronym> that declares a fixed or
				381	default value for the attribute, this function will retrieve it.
				382	</para>
				383	</note>
				384	</para>
				385	</callout>
				386	</calloutlist>
				387
				388	</para>
				389	</sect1>
				390
MDT 2002 John Fleck	598f6eb	2002-06-04 15:10:36 +0000	[diff] [blame]	391	<!--
				392	<appendix id="furtherresources">
				393	<title>Further Resources</title>
				394	<para></para>
				395	</appendix>
				396	-->
				397	<appendix id="sampledoc">
				398	<title>Sample Document</title>
				399	<programlisting>&STORY;</programlisting>
				400	</appendix>
				401	<appendix id="keywordappendix">
				402	<title>Code for Keyword Example</title>
				403	<para>
				404	<programlisting>&KEYWORD;</programlisting>
				405	</para>
				406	</appendix>
				407	<appendix id="addkeywordappendix">
				408	<title>Code for Add Keyword Example</title>
				409	<para>
				410	<programlisting>&ADDKEYWORD;</programlisting>
				411	</para>
				412	</appendix>
				413	<appendix id="addattributeappendix">
				414	<title>Code for Add Attribute Example</title>
				415	<para>
				416	<programlisting>&ADDATTRIBUTE;</programlisting>
				417	</para>
				418	</appendix>
MDT 2002 John Fleck	5452083	2002-06-13 03:30:26 +0000	[diff] [blame]	419	<appendix id="getattributeappendix">
				420	<title>Code for Retrieving Attribute Value Example</title>
				421	<para>
				422	<programlisting>&GETATTRIBUTE;</programlisting>
				423	</para>
				424	</appendix>
MDT 2002 John Fleck	598f6eb	2002-06-04 15:10:36 +0000	[diff] [blame]	425	</article>