Golang x/net: Difference between revisions

From wikinotes
No edit summary
Line 20: Line 20:
</blockquote><!-- Install -->
</blockquote><!-- Install -->


= Basics =
= Parsing/Rendering =
<blockquote>
<blockquote>
== Basics ==
<blockquote>
<syntaxhighlight lang="go">
import "golang.org/x/net/html"
raw := `
    <html>
      <head>
        <title>foo</title>
      </head>
      <body>
        <h1>Foo</h1>
        <p>hello world</p>
      </body>
    </html>`
// parse html
node, _ := html.Parse(strings.NewReader(raw))
// render html
var render strings.Builder
html.Render(&render, node)
render.String()              // '<html><head>...'
</syntaxhighlight>
</blockquote><!-- Basics -->
== Modifying Parsed HTML ==
<blockquote>
The <code>Node</code> datastructure uses value objects,<br>
you cannot simply locate/mutate nodes - you'll need to create and connect new instances.
* <code>atom</code> has constants representing every type of HTML element.
* Nodes keep information about their first/last child
* Nodes keep information about their siblings (neighbors under same parent)
To iterate through children, start at the node's first child, and loop through its siblings.<br>
Here's a reusable setup:
<syntaxhighlight lang="go">
func adjust(node *html.Node, page *mwdump.Page) (*html.Node, error) {
    var err error
    // match current node, return new/modified instances where desired
    node = this.adjustHeadNode(node, page)
    node = this.adjustBodyNode(node, page)
    node = this.adjustAnchorNode(node)
    if err != nil {
        return nil, err
    }
    // recurse through children
    var children []*html.Node
    for child := node.FirstChild; child != nil; child = child.NextSibling {
        child, err = this.adjust(child, page)
        if err != nil {
            return child, err
        }
        children = append(children, child)
    }
    // point Child/Sibling info in structs to the new children
    if len(children) > 0 {
        node.FirstChild = children[0]
        node.LastChild = children[len(children)-1]
    }
    for index, child := range children {
        if 0 < index && index < len(children)-1 {
            child.PrevSibling = children[index-1]
            child.NextSibling = children[index+1]
        }
    }
    return node, nil
}
</syntaxhighlight>
Here's a sample method that mutates a node
<syntaxhighlight lang="go">
// lower-cases all 'href' links in a '<a href="Foo/Bar">'
func (this *HTML) adjustAnchorNode(node *html.Node) (finalNode *html.Node, err error) {
    if node.Type != html.ElementNode {
        return node, nil
    }
    if node.DataAtom != atom.A {
        return node, nil
    }
    var attrs []html.Attribute
    for _, attr := range node.Attr {
        if attr.Key != "href" {
            attrs = append(attrs, attr)
            continue
        }
        newAttr := html.Attribute{
            Namespace: attr.Namespace,
            Key:      attr.Key,
            Val:      strings.ToLower(attr.Val),
        }
        attrs = append(attrs, newAttr)
    }
    return &html.Node{
        Parent:      node.Parent,
        FirstChild:  node.FirstChild,
        LastChild:  node.LastChild,
        PrevSibling: node.PrevSibling,
        NextSibling: node.NextSibling,
        Type:        node.Type,
        DataAtom:    node.DataAtom,
        Data:        node.Data,
        Namespace:  node.Namespace,
        Attr:        attrs,
    }, nil
}
</syntaxhighlight>
<syntaxhighlight lang="go">
</syntaxhighlight>
</blockquote><!-- Modifying Parsed HTML -->
</blockquote><!-- Parsing/Rendering -->
= Components =
<blockquote>
== ElementNodes vs TextNodes ==
<blockquote>
<code>ElementNode</code>s represent an HTML element.<br>
They do not store their text themselves; it is held in a child <code>TextNode</code>.
<syntaxhighlight lang="go">
<syntaxhighlight lang="go">
import "golang.org/x/net/html/atom" // HTML element types
import "golang.org/x/net/html"
import "golang.org/x/net/html/atom"
 
headerVal := html.Node{
    Type: html.TextNode,
    Data: "My Page",
}
 
header := html.Node{
    Type: html.ElementNode,
    DataAtom: atom.H1,
    Data: "h1",
    FirstChild: &headerVal,
    LastChild: &headerVal,
}
</syntaxhighlight>
</syntaxhighlight>
</blockquote><!-- ElementNodes vs TextNodes -->
</blockquote><!-- Components -->
</blockquote><!-- Basics -->
</blockquote><!-- Basics -->

Revision as of 18:37, 10 July 2022

The official library for parsing HTML.
It is not shipped with Go's standard library, but it is maintained by the Go developers.

Documentation

official docs https://pkg.go.dev/golang.org/x/net
atom.Atom constants (element types) https://pkg.go.dev/golang.org/x/net@v0.0.0-20220706163947-c90051bbdb60/html/atom#Atom

Install

go get golang.org/x/net

Parsing/Rendering

Basics

import "golang.org/x/net/html"

raw := `
    <html>
      <head>
        <title>foo</title>
      </head>
      <body>
        <h1>Foo</h1>
        <p>hello world</p>
      </body>
    </html>`

// parse html
node, _ := html.Parse(strings.NewReader(raw))

// render html
var render strings.Builder
html.Render(&render, node)
render.String()              // '<html><head>...'

Modifying Parsed HTML

The Node datastructure uses value objects,
you cannot simply locate/mutate nodes - you'll need to create and connect new instances.

  • atom has constants representing every type of HTML element.
  • Nodes keep information about their first/last child
  • Nodes keep information about their siblings (neighbors under same parent)

To iterate through children, start at the node's first child, and loop through its siblings.
Here's a reusable setup:

func adjust(node *html.Node, page *mwdump.Page) (*html.Node, error) {
    var err error

    // match current node, return new/modified instances where desired
    node = this.adjustHeadNode(node, page)
    node = this.adjustBodyNode(node, page)
    node = this.adjustAnchorNode(node)
    if err != nil {
        return nil, err
    }

    // recurse through children
    var children []*html.Node
    for child := node.FirstChild; child != nil; child = child.NextSibling {
        child, err = this.adjust(child, page)
        if err != nil {
            return child, err
        }
        children = append(children, child)
    }

    // point Child/Sibling info in structs to the new children
    if len(children) > 0 {
        node.FirstChild = children[0]
        node.LastChild = children[len(children)-1]
    }
    for index, child := range children {
        if 0 < index && index < len(children)-1 {
            child.PrevSibling = children[index-1]
            child.NextSibling = children[index+1]
        }
    }

    return node, nil
}

Here's a sample method that mutates a node

// adjustAnchorNode returns a copy of an '<a>' element in which every 'href'
// value is lower-cased, e.g. '<a href="Foo/Bar">' becomes '<a href="foo/bar">'.
// A node that is not an '<a>' element is handed back as-is.
// The returned error is always nil.
func (this *HTML) adjustAnchorNode(node *html.Node) (finalNode *html.Node, err error) {
    isAnchor := node.Type == html.ElementNode && node.DataAtom == atom.A
    if !isAnchor {
        return node, nil
    }

    // Copy the attributes one by one, rewriting only the 'href' values.
    var lowered []html.Attribute
    for i := range node.Attr {
        current := node.Attr[i] // a copy; the source attribute stays untouched
        if current.Key == "href" {
            current.Val = strings.ToLower(current.Val)
        }
        lowered = append(lowered, current)
    }

    // The replacement keeps every link and identity field of the original
    // node; only the attribute list differs.
    anchor := &html.Node{
        Type:        node.Type,
        DataAtom:    node.DataAtom,
        Data:        node.Data,
        Namespace:   node.Namespace,
        Attr:        lowered,
        Parent:      node.Parent,
        FirstChild:  node.FirstChild,
        LastChild:   node.LastChild,
        PrevSibling: node.PrevSibling,
        NextSibling: node.NextSibling,
    }
    return anchor, nil
}

Components

ElementNodes vs TextNodes

ElementNodes represent an HTML element.
They do not store their text themselves; it is held in a child TextNode.

import "golang.org/x/net/html"
import "golang.org/x/net/html/atom"

// The text shown inside an element lives in a separate TextNode.
headerVal := html.Node{
    Type: html.TextNode,
    Data: "My Page",
}

// The ElementNode itself only records which element it is.
header := html.Node{
    Type:     html.ElementNode,
    DataAtom: atom.H1,
    Data:     "h1",
}

// AppendChild sets header.FirstChild/LastChild and headerVal.Parent in one
// step, so the tree stays consistent for later edits such as RemoveChild.
header.AppendChild(&headerVal)