Golang encoding/xml: Difference between revisions

From wikinotes
No edit summary
No edit summary
 
(33 intermediate revisions by the same user not shown)
Line 1: Line 1:
go's builtin library for parsing xml.<br>
go's builtin library for parsing xml.<br>
See also: [[golang encoding]], [[xml]]
See also: [[golang encoding]], [[xml]], [[golang x/net]]


A more detailed introduction to go's encoding interface can be seen in [[golang encoding/json]].
A more detailed introduction to go's encoding interface can be seen in [[golang encoding/json]].


{{ NOTE |
Golang's builtin xml library does not support schema validation. }}


= Documentation =
= Documentation =
Line 33: Line 35:
     Skip  `xml:"-"`          // <User></User>
     Skip  `xml:"-"`          // <User></User>
}
}
</syntaxhighlight>
XML namespaces are supported by prefixing the element name in the struct tag with the namespace URL, separated by a space.
<syntaxhighlight lang="go">
// Html maps the root <html> element of an XHTML document.
type Html struct {
    // Tag format is "<namespace-URL> <local-name>": element "html" in the
    // XHTML namespace. The closing quote is required; a malformed tag is
    // silently ignored by encoding/xml.
    XMLName xml.Name `xml:"http://www.w3.org/1999/xhtml html"`
}
// <html xmlns="http://www.w3.org/1999/xhtml">
//  <!-- ... -->
// </html>
</syntaxhighlight>
</syntaxhighlight>


Known issues:
Known issues:
* xml namespaces don't work as expected: https://github.com/golang/go/issues/11496
* schema validation does not appear to be supported
* It doesn't appear that go's internal xml library supports schema validation
</blockquote><!-- Struct Tags -->
</blockquote><!-- Struct Tags -->
= Serialization =
<blockquote>
== Basics ==
<blockquote>
<syntaxhighlight lang="go">
// User serializes as <User name="..."><id>...</id></User>.
type User struct {
    Id   int    `xml:"id"`        // child element <id>
    Name string `xml:"name,attr"` // attribute on the <User> element
}

// main marshals a User to XML and prints the result.
func main() {
    user := User{123, "will"}
    out, err := xml.Marshal(&user)
    if err != nil {
        fmt.Println("marshalling user:", err)
        return
    }
    fmt.Println(string(out)) // <User name="will"><id>123</id></User>
}
</syntaxhighlight>
You can also marshal with an indent
<syntaxhighlight lang="go">
xml.MarshallIndent(
  &user,
  "  ",    // (2) indent entire object this many spaces
  "    ",  // (4) indent-width
)
//| | <-- 2x spaces
//  <User name="will">
//      <id>123</id>    // <-- indented 4x spaces
//  </User>
</syntaxhighlight>
</blockquote><!-- Basics -->
</blockquote><!-- Serializing -->
= Deserialization =
<blockquote>
== Basics ==
<blockquote>
<syntaxhighlight lang="go">
// User is populated from <User name="..."><id>...</id></User>.
type User struct {
    Id   int    `xml:"id"`        // child element <id>
    Name string `xml:"name,attr"` // attribute on the <User> element
}

// main unmarshals an XML document into a User and prints it.
func main() {
    raw := `<User name="will"><id>123</id></User>`
    var user User
    if err := xml.Unmarshal([]byte(raw), &user); err != nil {
        fmt.Println("unmarshalling user:", err)
        return
    }
    fmt.Println(user) // {123 will}
}
</syntaxhighlight>
</blockquote><!-- Basics -->
== Non-Homogenous XML ==
<blockquote>
XML is generally not homogenous.<br>
Record each possible sub-element as a field on your object.<br>
If an element can occur multiple times, declare it as a slice.<br>
You can ignore elements by not defining fields for them.
<syntaxhighlight lang="go">
encoded := `
  <mediawiki>
      <siteinfo>
          abc
      </siteinfo>
      <page>
          <title>Main Page</title>
      </page>
      <page>
          <title>Linux</title>
      </page>
  </mediawiki>
`
// Result maps the <mediawiki> root element.
type Result struct {
    XMLName  xml.Name `xml:"mediawiki"` // root node
    SiteInfo string   `xml:"siteinfo"`  // only one 'siteinfo' element under 'mediawiki'
    Page     []Page   `xml:"page"`      // multiple 'page' elements under 'mediawiki'
}
// Page maps one <page> element.
type Page struct {
    Title string `xml:"title"`
}
var result Result
xml.Unmarshal([]byte(encoded), &result) // error ignored for brevity; check it in real code
fmt.Println(result.Page[0].Title)       // Main Page (Title is a plain string; Page[1].Title is "Linux")
</syntaxhighlight>
</blockquote><!-- Non-Homogenous XML -->
== Arbitrary XML ==
<blockquote>
<syntaxhighlight lang="go">
// Node captures an arbitrary XML element: its name, all of its attributes,
// its character data, and (recursively) its child elements.
type Node struct {
XMLName xml.Name
Attrs  []*xml.Attr `xml:",any,attr"` // each attr.Name, attr.Value
Data    string      `xml:",chardata"` // 'Title' in <h1>Title</h1>
Nodes  []*Node    `xml:",any"`      // child-nodes
}
// deserialize (raw holds the XML text and is defined elsewhere;
// the error returned by Unmarshal is not checked here)
var parsed Node
xml.Unmarshal([]byte(raw), &parsed)
// re-serialize
bytes, err := xml.Marshal(&parsed)
</syntaxhighlight>
{{ expand
| Example: Deserialize, Modify, Re-serialize an arbitrary xml object.
|
{{ WARNING |
Do not parse HTML with an XML parser: HTML allows constructs that are not well-formed XML (for example, an unclosed <code><nowiki><br></nowiki></code>).
}}
<syntaxhighlight lang="go">
import (
"encoding/xml"
"fmt"
)
// Node is a generic XML element holding its name, attributes, text and
// child elements, so any document can be decoded into a tree of Nodes.
type Node struct {
	XMLName xml.Name
	Attrs   []*xml.Attr `xml:",any,attr"` // every attribute as a name/value pair
	Data    string      `xml:",chardata"` // text content, e.g. 'Title' in <h1>Title</h1>
	Nodes   []*Node     `xml:",any"`      // child elements
}

// addDotHtmlToAHrefs walks the tree rooted at node and appends ".html" to
// the href attribute of every <a> element, e.g. <a href="foo"> becomes
// <a href="foo.html">. Attributes are modified in place.
func addDotHtmlToAHrefs(node *Node) {
	if node.XMLName.Local == "a" {
		for i := range node.Attrs {
			a := node.Attrs[i]
			if a.Name.Local != "href" {
				continue
			}
			a.Value = fmt.Sprint(a.Value, ".html")
		}
	}
	// Descend into the children regardless of this element's own name.
	for i := range node.Nodes {
		addDotHtmlToAHrefs(node.Nodes[i])
	}
}
// main decodes a small document into a Node tree, rewrites its links with
// addDotHtmlToAHrefs, and prints the re-encoded, indented XML.
func main() {
	// define xml
	raw := `<html>
          <p><a href="abc">ABC</a></p>
          <blockquote><p><a href="def">DEF</a></p></blockquote>
        </html>`
	// unmarshal
	var parsed Node
	if err := xml.Unmarshal([]byte(raw), &parsed); err != nil {
		fmt.Println("unmarshalling document:", err)
		return
	}
	// modify
	addDotHtmlToAHrefs(&parsed)
	// re-marshal, modified
	out, err := xml.MarshalIndent(&parsed, "", "  ")
	if err != nil {
		fmt.Println("marshalling document:", err)
		return
	}
	fmt.Println(string(out))
}
</syntaxhighlight>
Outputs
<syntaxhighlight lang="html5">
<html>
  <p>
    <a href="abc.html">ABC</a>
  </p>
  <blockquote>
    <p>
      <a href="def.html">DEF</a>
    </p>
  </blockquote>
</html>
</syntaxhighlight>
}}
</blockquote><!-- Arbitrary XML -->
== Custom Deserializers ==
<blockquote>
{{ TODO |
finish }}
<syntaxhighlight lang="go">
func (revision *Revision) UnmarshalXML(d *xml.Decoder, start xml.Startelement) error {
    return nil
}
</syntaxhighlight>
</blockquote><!-- Custom Deserializers -->
</blockquote><!-- Deserialization -->

Latest revision as of 06:07, 10 July 2022

go's builtin library for parsing xml.
See also: golang encoding, xml, golang x/net

A more detailed introduction to go's encoding interface can be seen in golang encoding/json.

NOTE:

Golang's builtin xml library does not support schema validation.

Documentation

official docs https://pkg.go.dev/encoding/xml@go1.18.3

Tutorials

tutorialedge https://tutorialedge.net/golang/parsing-xml-with-golang/

Struct Tags

See full details here

// User demonstrates the three basic struct-tag forms.
type User struct {
    Name  string `xml:"Name"`       // <User><Name>value</Name></User>
    Color string `xml:"color,attr"` // <User color="value"></User>
    Skip  string `xml:"-"`          // <User></User>
}

XML namespaces are supported by prefixing the element name in the struct tag with the namespace URL, separated by a space.

// Html maps the root <html> element of an XHTML document.
type Html struct {
    // Tag format is "<namespace-URL> <local-name>": element "html" in the
    // XHTML namespace. The closing quote is required; a malformed tag is
    // silently ignored by encoding/xml.
    XMLName xml.Name `xml:"http://www.w3.org/1999/xhtml html"`
}

// <html xmlns="http://www.w3.org/1999/xhtml">
//   <!-- ... -->
// </html>

Known issues:

  • schema validation does not appear to be supported

Serialization

Basics

// User serializes as <User name="..."><id>...</id></User>.
type User struct {
    Id   int    `xml:"id"`        // child element <id>
    Name string `xml:"name,attr"` // attribute on the <User> element
}

// main marshals a User to XML and prints the result.
func main() {
    user := User{123, "will"}
    out, err := xml.Marshal(&user)
    if err != nil {
        fmt.Println("marshalling user:", err)
        return
    }
    fmt.Println(string(out)) // <User name="will"><id>123</id></User>
}

You can also marshal with an indent

xml.MarshalIndent(
  &user,
  "  ",    // prefix: starts every output line (here 2 spaces)
  "    ",  // indent: repeated once per nesting level (here 4 spaces)
)

//| | <-- 2x spaces
//  <User name="will">
//      <id>123</id>     // <-- indented 4x spaces
//  </User>

Deserialization

Basics

// User is populated from <User name="..."><id>...</id></User>.
type User struct {
    Id   int    `xml:"id"`        // child element <id>
    Name string `xml:"name,attr"` // attribute on the <User> element
}

// main unmarshals an XML document into a User and prints it.
func main() {
    raw := `<User name="will"><id>123</id></User>`
    var user User
    if err := xml.Unmarshal([]byte(raw), &user); err != nil {
        fmt.Println("unmarshalling user:", err)
        return
    }
    fmt.Println(user) // {123 will}
}

Non-Homogenous XML

XML is generally not homogenous.
Record each possible sub-element as a field on your object.
If an element can occur multiple times, declare it as a slice.
You can ignore elements by not defining fields for them.

encoded := `
  <mediawiki>
      <siteinfo>
          abc
      </siteinfo>
      <page>
          <title>Main Page</title>
      </page>
      <page>
          <title>Linux</title>
      </page>
  </mediawiki>
`

// Result maps the <mediawiki> root element.
type Result struct {
    XMLName  xml.Name `xml:"mediawiki"` // root node
    SiteInfo string   `xml:"siteinfo"`  // only one 'siteinfo' element under 'mediawiki'
    Page     []Page   `xml:"page"`      // multiple 'page' elements under 'mediawiki'
}

// Page maps one <page> element.
type Page struct {
    Title string `xml:"title"`
}

var result Result
xml.Unmarshal([]byte(encoded), &result) // error ignored for brevity; check it in real code
fmt.Println(result.Page[0].Title)       // Main Page (Title is a plain string; Page[1].Title is "Linux")

Arbitrary XML

// Node captures an arbitrary XML element: its name, all of its attributes,
// its character data, and (recursively) its child elements.
type Node struct {
	XMLName xml.Name
	Attrs   []*xml.Attr `xml:",any,attr"` // each attr.Name, attr.Value
	Data    string      `xml:",chardata"` // 'Title' in <h1>Title</h1>
	Nodes   []*Node     `xml:",any"`      // child-nodes
}

// deserialize (raw holds the XML text and is defined elsewhere;
// the error returned by Unmarshal is not checked here)
var parsed Node
xml.Unmarshal([]byte(raw), &parsed)

// re-serialize
bytes, err := xml.Marshal(&parsed)

Example: Deserialize, Modify, Re-serialize an arbitrary xml object.


WARNING:

Do not parse HTML with an XML parser: HTML allows constructs that are not well-formed XML (for example, an unclosed <br>).

import (
	"encoding/xml"
	"fmt"
)

// Node is a generic XML element holding its name, attributes, text and
// child elements, so any document can be decoded into a tree of Nodes.
type Node struct {
	XMLName xml.Name
	Attrs   []*xml.Attr `xml:",any,attr"` // every attribute as a name/value pair
	Data    string      `xml:",chardata"` // text content, e.g. 'Title' in <h1>Title</h1>
	Nodes   []*Node     `xml:",any"`      // child elements
}

// addDotHtmlToAHrefs walks the tree rooted at node and appends ".html" to
// the href attribute of every <a> element, e.g. <a href="foo"> becomes
// <a href="foo.html">. Attributes are modified in place.
func addDotHtmlToAHrefs(node *Node) {
	if node.XMLName.Local == "a" {
		for i := range node.Attrs {
			a := node.Attrs[i]
			if a.Name.Local != "href" {
				continue
			}
			a.Value = fmt.Sprint(a.Value, ".html")
		}
	}

	// Descend into the children regardless of this element's own name.
	for i := range node.Nodes {
		addDotHtmlToAHrefs(node.Nodes[i])
	}
}

// main decodes a small document into a Node tree, rewrites its links with
// addDotHtmlToAHrefs, and prints the re-encoded, indented XML.
func main() {
	// define xml
	raw := `<html>
          <p><a href="abc">ABC</a></p>
          <blockquote><p><a href="def">DEF</a></p></blockquote>
        </html>`

	// unmarshal
	var parsed Node
	if err := xml.Unmarshal([]byte(raw), &parsed); err != nil {
		fmt.Println("unmarshalling document:", err)
		return
	}

	// modify
	addDotHtmlToAHrefs(&parsed)

	// re-marshal, modified
	out, err := xml.MarshalIndent(&parsed, "", "  ")
	if err != nil {
		fmt.Println("marshalling document:", err)
		return
	}
	fmt.Println(string(out))
}

Outputs

<html>
  <p>
    <a href="abc.html">ABC</a>
  </p>
  <blockquote>
    <p>
      <a href="def.html">DEF</a>
    </p>
  </blockquote>
</html>


Custom Deserializers

TODO:

finish

// UnmarshalXML implements xml.Unmarshaler for Revision.
//
// TODO: stub. encoding/xml requires UnmarshalXML to consume exactly one
// element (e.g. via d.DecodeElement or d.Skip); returning nil without reading
// from d makes the decoder report that the element was not consumed.
func (revision *Revision) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
    return nil
}