Golang x/net: Difference between revisions
From wikinotes
No edit summary |
|||
(12 intermediate revisions by the same user not shown) | |||
Line 22: | Line 22: | ||
= Components = | = Components = | ||
<blockquote> | <blockquote> | ||
== | == Nodes, ElementTypes == | ||
<blockquote> | <blockquote> | ||
<code>ElementNode</code>s represent an HTML element. | * [https://pkg.go.dev/golang.org/x/net@v0.0.0-20220706163947-c90051bbdb60/html#NodeType ElementType] describes the type of element in the DOM (ex. text, element, doctype, ..) | ||
* [https://pkg.go.dev/golang.org/x/net@v0.0.0-20220706163947-c90051bbdb60/html#Node Node]s represent xml-like elements | |||
* [https://pkg.go.dev/golang.org/x/net@v0.0.0-20220706163947-c90051bbdb60/html/atom#Atom Atom]s represent html element types | |||
</blockquote><!-- Nodes, ElementTypes --> | |||
== ElementNodes contain TextNodes == | |||
<blockquote> | |||
* <code>ElementNode</code>s represent an HTML element. | |||
* <code>TextNode</code>s store the value of an HTML element (nested under ElementNodes). | |||
<syntaxhighlight lang="go"> | <syntaxhighlight lang="go"> | ||
Line 44: | Line 51: | ||
LastChild: &headerVal, | LastChild: &headerVal, | ||
} | } | ||
</syntaxhighlight> | |||
Means the same as | |||
<syntaxhighlight lang="html5"> | |||
<h1>My Page</h1> | |||
</syntaxhighlight> | </syntaxhighlight> | ||
</blockquote><!-- ElementNodes vs TextNodes --> | </blockquote><!-- ElementNodes vs TextNodes --> | ||
Line 78: | Line 90: | ||
== Modifying Parsed HTML == | == Modifying Parsed HTML == | ||
<blockquote> | <blockquote> | ||
You can mutate <code>Node</code> structs in place,<br> | |||
if adding to children make sure to <code>AppendChild()</code> so it gets added to the array the slice points to. | |||
* <code>atom</code> has constants representing every type of HTML element. | * <code>atom</code> has constants representing every type of HTML element. | ||
Line 87: | Line 99: | ||
To iterate through children, start at the node's first-child, and loop through it's siblings.<br> | To iterate through children, start at the node's first-child, and loop through it's siblings.<br> | ||
Here's a reusable setup: | Here's a reusable setup: | ||
<syntaxhighlight lang="go"> | <syntaxhighlight lang="go"> | ||
type HTML struct{} | |||
// recurse through all nodes | |||
func (this *HTML) adjust(node *html.Node) (*html.Node, error) { | |||
err := this.adjustAnchorNode(node) | |||
if err := nil { | |||
if err | |||
return nil, err | return nil, err | ||
} | } | ||
// recurse through children | // recurse through and modify children | ||
for child := node.FirstChild; child != nil; child = child.NextSibling { | for child := node.FirstChild; child != nil; child = child.NextSibling { | ||
err = this.adjust(child, page) | |||
if err != nil { | if err != nil { | ||
return | return nil, err | ||
} | } | ||
} | } | ||
return node, nil | return node, nil | ||
} | } | ||
// lower-cases all 'href' links in a '<a href="Foo/Bar">' | // lower-cases all 'href' links in a '<a href="Foo/Bar">' | ||
func adjustAnchorNode(node *html.Node) | func (this *HTML) adjustAnchorNode(node *html.Node) error { | ||
if node.Type != html.ElementNode { | if node.Type != html.ElementNode { | ||
return | return nil | ||
} | } | ||
if node.DataAtom != atom.A { | if node.DataAtom != atom.A { | ||
return | return nil | ||
} | } | ||
var attrs []html.Attribute | var attrs []html.Attribute | ||
for _, attr := range node.Attr { | for _, attr := range node.Attr { | ||
if attr.Key != "href" { | if attr.Key != "href" { | ||
attrs = append(attrs, attr) | attrs = append(attrs, attr) | ||
} else { | |||
} | attrs = append(attrs, strings.ToLower(attr.Val)) // <-- modify attr | ||
} | } | ||
} | } | ||
node.Attr = attrs | |||
return | return nil | ||
} | } | ||
</syntaxhighlight> | </syntaxhighlight> | ||
</blockquote><!-- Modifying Parsed HTML --> | </blockquote><!-- Modifying Parsed HTML --> | ||
</blockquote><!-- Parsing/Rendering --> | </blockquote><!-- Parsing/Rendering --> |
Latest revision as of 19:05, 24 July 2022
The official library for parsing HTML.
It is not shipped with Go's standard library, but it is maintained by the Go developers.
Documentation
- official docs: https://pkg.go.dev/golang.org/x/net
- atom.Atom constants (element types): https://pkg.go.dev/golang.org/x/net@v0.0.0-20220706163947-c90051bbdb60/html/atom#Atom
Install
go get golang.org/x/net
Components
Nodes, ElementTypes
- ElementType describes the type of element in the DOM (ex. text, element, doctype, ..)
- Nodes represent xml-like elements
- Atoms represent html element types
ElementNodes contain TextNodes
- ElementNodes represent an HTML element.
- TextNodes store the value of an HTML element (nested under ElementNodes).

import "golang.org/x/net/html"
import "golang.org/x/net/html/atom"

headerVal := html.Node{
	Type: html.TextNode,
	Data: "My Page",
}
header := html.Node{
	Type: html.ElementNode,
	DataAtom: atom.H1,
	Data: "h1",
	FirstChild: &headerVal,
	LastChild: &headerVal,
}

Means the same as:
<h1>My Page</h1>
Parsing/Rendering
Basics
import "golang.org/x/net/html"

raw := `
<html>
  <head>
    <title>foo</title>
  </head>
  <body>
    <h1>Foo</h1>
    <p>hello world</p>
  </body>
</html>`

// parse html
node, _ := html.Parse(strings.NewReader(raw))

// render html
var render strings.Builder
html.Render(&render, node)
render.String() // '<html><head>...'

Modifying Parsed HTML
You can mutate Node structs in place;
when adding to children, make sure to call AppendChild() so it gets added to the array the slice points to.
- atom has constants representing every type of HTML element.
- Nodes keep information about their first/last child
- Nodes keep information about their siblings (neighbors under same parent)
To iterate through children, start at the node's first child, and loop through its siblings.
Here's a reusable setup:type HTML struct{} // recurse through all nodes func (this *HTML) adjust(node *html.Node) (*html.Node, error) { err := this.adjustAnchorNode(node) if err := nil { return nil, err } // recurse through and modify children for child := node.FirstChild; child != nil; child = child.NextSibling { err = this.adjust(child, page) if err != nil { return nil, err } } return node, nil } // lower-cases all 'href' links in a '<a href="Foo/Bar">' func (this *HTML) adjustAnchorNode(node *html.Node) error { if node.Type != html.ElementNode { return nil } if node.DataAtom != atom.A { return nil } var attrs []html.Attribute for _, attr := range node.Attr { if attr.Key != "href" { attrs = append(attrs, attr) } else { attrs = append(attrs, strings.ToLower(attr.Val)) // <-- modify attr } } node.Attr = attrs return nil }