Golang x/net: Difference between revisions
From wikinotes
No edit summary |
(→Basics) |
||
Line 20: | Line 20: | ||
</blockquote><!-- Install --> | </blockquote><!-- Install --> | ||
= | = Parsing/Rendering = | ||
<blockquote> | <blockquote> | ||
== Basics == | |||
<blockquote> | |||
<syntaxhighlight lang="go"> | |||
import "golang.org/x/net/html" | |||
raw := ` | |||
<html> | |||
<head> | |||
<title>foo</title> | |||
</head> | |||
<body> | |||
<h1>Foo</h1> | |||
<p>hello world</p> | |||
</body> | |||
</html>` | |||
// parse html | |||
node, _ := html.Parse(strings.NewReader(raw)) | |||
// render html | |||
var render strings.Builder | |||
html.Render(&render, node) | |||
render.String() // '<html><head>...' | |||
</syntaxhighlight> | |||
</blockquote><!-- Basics --> | |||
== Modifying Parsed HTML == | |||
<blockquote> | |||
The <code>Node</code> datastructure uses value objects,<br> | |||
you cannot simply locate/mutate nodes - you'll need to create and connect new instances. | |||
* <code>atom</code> has constants representing every type of HTML element. | |||
* Nodes keep information about their first/last child | |||
* Nodes keep information about their siblings (neighbors under same parent) | |||
To iterate through children, start at the node's first-child, and loop through its siblings.<br> | |||
Here's a reusable setup: | |||
<syntaxhighlight lang="go"> | |||
func adjust(node *html.Node, page *mwdump.Page) (*html.Node, error) { | |||
var err error | |||
// match current node, return new/modified instances where desired | |||
node = this.adjustHeadNode(node, page) | |||
node = this.adjustBodyNode(node, page) | |||
node = this.adjustAnchorNode(node) | |||
if err != nil { | |||
return nil, err | |||
} | |||
// recurse through children | |||
var children []*html.Node | |||
for child := node.FirstChild; child != nil; child = child.NextSibling { | |||
child, err = this.adjust(child, page) | |||
if err != nil { | |||
return child, err | |||
} | |||
children = append(children, child) | |||
} | |||
// point Child/Sibling info in structs to the new children | |||
if len(children) > 0 { | |||
node.FirstChild = children[0] | |||
node.LastChild = children[len(children)-1] | |||
} | |||
for index, child := range children { | |||
if 0 < index && index < len(children)-1 { | |||
child.PrevSibling = children[index-1] | |||
child.NextSibling = children[index+1] | |||
} | |||
} | |||
return node, nil | |||
} | |||
</syntaxhighlight> | |||
Here's a sample method that mutates a node | |||
<syntaxhighlight lang="go"> | |||
// lower-cases all 'href' links in a '<a href="Foo/Bar">' | |||
func (this *HTML) adjustAnchorNode(node *html.Node) (finalNode *html.Node, err error) { | |||
if node.Type != html.ElementNode { | |||
return node, nil | |||
} | |||
if node.DataAtom != atom.A { | |||
return node, nil | |||
} | |||
var attrs []html.Attribute | |||
for _, attr := range node.Attr { | |||
if attr.Key != "href" { | |||
attrs = append(attrs, attr) | |||
continue | |||
} | |||
newAttr := html.Attribute{ | |||
Namespace: attr.Namespace, | |||
Key: attr.Key, | |||
Val: strings.ToLower(attr.Val), | |||
} | |||
attrs = append(attrs, newAttr) | |||
} | |||
return &html.Node{ | |||
Parent: node.Parent, | |||
FirstChild: node.FirstChild, | |||
LastChild: node.LastChild, | |||
PrevSibling: node.PrevSibling, | |||
NextSibling: node.NextSibling, | |||
Type: node.Type, | |||
DataAtom: node.DataAtom, | |||
Data: node.Data, | |||
Namespace: node.Namespace, | |||
Attr: attrs, | |||
}, nil | |||
} | |||
</syntaxhighlight> | |||
<syntaxhighlight lang="go"> | |||
</syntaxhighlight> | |||
</blockquote><!-- Modifying Parsed HTML --> | |||
</blockquote><!-- Parsing/Rendering --> | |||
= Components = | |||
<blockquote> | |||
== ElementNodes vs TextNodes == | |||
<blockquote> | |||
<code>ElementNode</code>s represent an HTML element.<br> | |||
They do not store their value; this is deferred to a child <code>TextNode</code>. | |||
<syntaxhighlight lang="go"> | <syntaxhighlight lang="go"> | ||
import "golang.org/x/net/html/atom" | import "golang.org/x/net/html" | ||
import "golang.org/x/net/html/atom" | |||
// The text shown inside the <h1> lives in its own TextNode.
headerVal := html.Node{
	Type: html.TextNode,
	Data: "My Page",
}

// The ElementNode carries the tag (DataAtom/Data) and links to its text child.
header := html.Node{
	Type:       html.ElementNode,
	DataAtom:   atom.H1,
	Data:       "h1",
	FirstChild: &headerVal,
	LastChild:  &headerVal,
}

// Link the child back to its parent so the tree is consistent in both directions.
headerVal.Parent = &header
</syntaxhighlight> | </syntaxhighlight> | ||
</blockquote><!-- ElementNodes vs TextNodes --> | |||
</blockquote><!-- Components --> | |||
</blockquote><!-- Basics --> | </blockquote><!-- Basics --> |
Revision as of 18:37, 10 July 2022
The official library for parsing HTML.
It is not shipped with Go's standard library, but it is maintained by the Go developers.
Documentation
official docs https://pkg.go.dev/golang.org/x/net atom.Atom constants (element types) https://pkg.go.dev/golang.org/x/net@v0.0.0-20220706163947-c90051bbdb60/html/atom#Atom
Install
go get golang.org/x/net
Parsing/Rendering
Basics
import "golang.org/x/net/html" raw := ` <html> <head> <title>foo</title> </head> <body> <h1>Foo</h1> <p>hello world</p> </body> </html>` // parse html node, _ := html.Parse(strings.NewReader(raw)) // render html var render strings.Builder html.Render(&render, node) render.String() // '<html><head>...'Modifying Parsed HTML
The
Node
datastructure uses value objects,
you cannot simply locate/mutate nodes - you'll need to create and connect new instances.
atom
has constants representing every type of HTML element.
- Nodes keep information about their first/last child
- Nodes keep information about their siblings (neighbors under same parent)
To iterate through children, start at the node's first-child, and loop through its siblings.
Here's a reusable setup:func adjust(node *html.Node, page *mwdump.Page) (*html.Node, error) { var err error // match current node, return new/modified instances where desired node = this.adjustHeadNode(node, page) node = this.adjustBodyNode(node, page) node = this.adjustAnchorNode(node) if err != nil { return nil, err } // recurse through children var children []*html.Node for child := node.FirstChild; child != nil; child = child.NextSibling { child, err = this.adjust(child, page) if err != nil { return child, err } children = append(children, child) } // point Child/Sibling info in structs to the new children if len(children) > 0 { node.FirstChild = children[0] node.LastChild = children[len(children)-1] } for index, child := range children { if 0 < index && index < len(children)-1 { child.PrevSibling = children[index-1] child.NextSibling = children[index+1] } } return node, nil }Here's a sample method that mutates a node
// lower-cases all 'href' links in a '<a href="Foo/Bar">' func (this *HTML) adjustAnchorNode(node *html.Node) (finalNode *html.Node, err error) { if node.Type != html.ElementNode { return node, nil } if node.DataAtom != atom.A { return node, nil } var attrs []html.Attribute for _, attr := range node.Attr { if attr.Key != "href" { attrs = append(attrs, attr) continue } newAttr := html.Attribute{ Namespace: attr.Namespace, Key: attr.Key, Val: strings.ToLower(attr.Val), } attrs = append(attrs, newAttr) } return &html.Node{ Parent: node.Parent, FirstChild: node.FirstChild, LastChild: node.LastChild, PrevSibling: node.PrevSibling, NextSibling: node.NextSibling, Type: node.Type, DataAtom: node.DataAtom, Data: node.Data, Namespace: node.Namespace, Attr: attrs, }, nil }
Components
ElementNodes vs TextNodes
ElementNode
s represent an HTML element.
They do not store their value; this is deferred to a child TextNode
import "golang.org/x/net/html" import "golang.org/x/net/html/atom" headerVal := html.Node{ Type: html.TextNode, Data: "My Page", } header := html.Node{ Type: html.ElementNode, DataAtom: atom.H1, Data: "h1", FirstChild: &headerVal, LastChild: &headerVal, }