Golang x/net: Difference between revisions

From wikinotes
(Created page with "The official library for parsing HTML.<br> It is not shipped with go's standard library, but it is maintained by the go developers. = Documentation = <blockquote> {| class="wikitable" |- | official docs || https://pkg.go.dev/golang.org/x/net/html |- |} </blockquote><!-- Documentation -->")
 
 
(21 intermediate revisions by the same user not shown)
Line 6: Line 6:
{| class="wikitable"
{| class="wikitable"
|-
|-
| official docs || https://pkg.go.dev/golang.org/x/net/html
| official docs || https://pkg.go.dev/golang.org/x/net
|-
| atom.Atom constants (element types) || https://pkg.go.dev/golang.org/x/net@v0.0.0-20220706163947-c90051bbdb60/html/atom#Atom
|-
|-
|}
|}
</blockquote><!-- Documentation -->
</blockquote><!-- Documentation -->
= Install =
<blockquote>
<syntaxhighlight lang="bash">
go get golang.org/x/net
</syntaxhighlight>
</blockquote><!-- Install -->
= Components =
<blockquote>
== Nodes, ElementTypes ==
<blockquote>
* [https://pkg.go.dev/golang.org/x/net@v0.0.0-20220706163947-c90051bbdb60/html#NodeType NodeType] describes the type of a node in the DOM (ex. text, element, doctype, ..)
* [https://pkg.go.dev/golang.org/x/net@v0.0.0-20220706163947-c90051bbdb60/html#Node Node]s represent xml-like elements
* [https://pkg.go.dev/golang.org/x/net@v0.0.0-20220706163947-c90051bbdb60/html/atom#Atom Atom]s represent html element types
</blockquote><!-- Nodes, ElementTypes -->
== ElementNodes contain TextNodes ==
<blockquote>
* <code>ElementNode</code>s represent an HTML element.
* <code>TextNode</code>s store the value of an HTML element (nested under ElementNodes).
<syntaxhighlight lang="go">
import "golang.org/x/net/html"
import "golang.org/x/net/html/atom"
// Text node holding the heading's visible text.
headerVal := html.Node{
    Type: html.TextNode,
    Data: "My Page",
}
// <h1> element node whose only child is the text node above.
header := html.Node{
    Type: html.ElementNode,
    DataAtom: atom.H1,
    Data: "h1",
    // There is a single child, so FirstChild and LastChild point at the same node.
    FirstChild: &headerVal,
    LastChild: &headerVal,
}
</syntaxhighlight>
Means the same as
<syntaxhighlight lang="html5">
<h1>My Page</h1>
</syntaxhighlight>
</blockquote><!-- ElementNodes vs TextNodes -->
</blockquote><!-- Components -->
= Parsing/Rendering =
<blockquote>
== Basics ==
<blockquote>
<syntaxhighlight lang="go">
import "golang.org/x/net/html"
raw := `
    <html>
      <head>
        <title>foo</title>
      </head>
      <body>
        <h1>Foo</h1>
        <p>hello world</p>
      </body>
    </html>`
// parse html
node, _ := html.Parse(strings.NewReader(raw))
// render html
var render strings.Builder
html.Render(&render, node)
render.String()              // '<html><head>...'
</syntaxhighlight>
</blockquote><!-- Basics -->
== Modifying Parsed HTML ==
<blockquote>
You can mutate <code>Node</code> structs in place,<br>
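if adding to children make sure to use <code>AppendChild()</code> so the new node's <code>Parent</code>/sibling pointers and the parent's <code>FirstChild</code>/<code>LastChild</code> are kept consistent (children are a linked list, not a slice).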
* <code>atom</code> has constants representing every type of HTML element.
* Nodes keep information about their first/last child
* Nodes keep information about their siblings (neighbors under same parent)
To iterate through children, start at the node's first-child, and loop through its siblings.<br>
Here's a reusable setup:
<syntaxhighlight lang="go">
type HTML struct{}
// recurse through all nodes
func (this *HTML) adjust(node *html.Node) (*html.Node, error) {
    err := this.adjustAnchorNode(node)
    if err := nil {
        return nil, err
    }
    // recurse through and modify children
    for child := node.FirstChild; child != nil; child = child.NextSibling {
        err = this.adjust(child, page)
        if err != nil {
            return nil, err
        }
    }
    return node, nil
}
// lower-cases all 'href' links in a '<a href="Foo/Bar">'
func (this *HTML) adjustAnchorNode(node *html.Node) error {
    if node.Type != html.ElementNode {
        return nil
    }
    if node.DataAtom != atom.A {
        return nil
    }
    var attrs []html.Attribute
    for _, attr := range node.Attr {
        if attr.Key != "href" {
            attrs = append(attrs, attr)
        } else {
            attrs = append(attrs, strings.ToLower(attr.Val))  // <-- modify attr
        }
    }
    node.Attr = attrs
    return nil
}
</syntaxhighlight>
</blockquote><!-- Modifying Parsed HTML -->
</blockquote><!-- Parsing/Rendering -->

Latest revision as of 19:05, 24 July 2022

The official library for parsing HTML.
It is not shipped with go's standard library, but it is maintained by the go developers.

Documentation

official docs https://pkg.go.dev/golang.org/x/net
atom.Atom constants (element types) https://pkg.go.dev/golang.org/x/net@v0.0.0-20220706163947-c90051bbdb60/html/atom#Atom

Install

go get golang.org/x/net

Components

Nodes, ElementTypes

  • NodeType describes the type of a node in the DOM (ex. text, element, doctype, ..)
  • Nodes represent xml-like elements
  • Atoms represent html element types

ElementNodes contain TextNodes

  • ElementNodes represent an HTML element.
  • TextNodes store the value of an HTML element (nested under ElementNodes).
import "golang.org/x/net/html"
import "golang.org/x/net/html/atom"

// Text node holding the heading's visible text.
headerVal := html.Node{
    Type: html.TextNode,
    Data: "My Page",
}

// <h1> element node whose only child is the text node above.
header := html.Node{
    Type: html.ElementNode,
    DataAtom: atom.H1,
    Data: "h1",
    // There is a single child, so FirstChild and LastChild point at the same node.
    FirstChild: &headerVal,
    LastChild: &headerVal,
}

Means the same as

<h1>My Page</h1>

Parsing/Rendering

Basics

import "golang.org/x/net/html"

raw := `
    <html>
      <head>
        <title>foo</title>
      </head>
      <body>
        <h1>Foo</h1>
        <p>hello world</p>
      </body>
    </html>`

// parse html
node, _ := html.Parse(strings.NewReader(raw))

// render html
var render strings.Builder
html.Render(&render, node)
render.String()              // '<html><head>...'

Modifying Parsed HTML

You can mutate Node structs in place,
if adding to children make sure to use AppendChild() so the new node's Parent/sibling pointers and the parent's FirstChild/LastChild are kept consistent (children are a linked list, not a slice).

  • atom has constants representing every type of HTML element.
  • Nodes keep information about their first/last child
  • Nodes keep information about their siblings (neighbors under same parent)

To iterate through children, start at the node's first-child, and loop through its siblings.
Here's a reusable setup:

type HTML struct{}

// recurse through all nodes
func (this *HTML) adjust(node *html.Node) (*html.Node, error) {
    err := this.adjustAnchorNode(node)
    if err := nil {
        return nil, err
    }

    // recurse through and modify children
    for child := node.FirstChild; child != nil; child = child.NextSibling {
        err = this.adjust(child, page)
        if err != nil {
            return nil, err
        }
    }
    return node, nil
}

// adjustAnchorNode lower-cases the value of every 'href' attribute on an
// anchor element, e.g. '<a href="Foo/Bar">' becomes '<a href="foo/bar">'.
//
// Nodes that are not <a> elements are left untouched. The attributes are
// modified in place; all other attributes keep their value and order.
// It currently always returns nil.
func (h *HTML) adjustAnchorNode(node *html.Node) error {
    if node.Type != html.ElementNode || node.DataAtom != atom.A {
        return nil
    }

    // Index into the slice so the write lands on the node's own attribute,
    // not on the per-iteration copy a value range would give us.
    for i := range node.Attr {
        if node.Attr[i].Key == "href" {
            node.Attr[i].Val = strings.ToLower(node.Attr[i].Val) // <-- modify attr
        }
    }

    return nil
}