Golang x/net: Difference between revisions

From wikinotes
 
(8 intermediate revisions by the same user not shown)
Line 90: Line 90:
== Modifying Parsed HTML ==
== Modifying Parsed HTML ==
<blockquote>
<blockquote>
The <code>Node</code> datastructure uses value objects,<br>
You can mutate <code>Node</code> structs in place,<br>
you cannot simply locate/mutate nodes - you'll need to create and connect new instances.
if adding to children make sure to <code>AppendChild()</code> so it gets added to the array the slice points to.


* <code>atom</code> has constants representing every type of HTML element.
* <code>atom</code> has constants representing every type of HTML element.
Line 99: Line 99:
To iterate through children, start at the node's first-child, and loop through it's siblings.<br>
To iterate through children, start at the node's first-child, and loop through it's siblings.<br>
Here's a reusable setup:
Here's a reusable setup:
<syntaxhighlight lang="go">
<syntaxhighlight lang="go">
func adjust(node *html.Node, page *mwdump.Page) (*html.Node, error) {
type HTML struct{}
    var err error


    // match current node, return new/modified instances where desired
// recurse through all nodes
    node = adjustHeadNode(node, page)
func (this *HTML) adjust(node *html.Node) (*html.Node, error) {
    node = adjustBodyNode(node, page)
     err := this.adjustAnchorNode(node)
     node = adjustAnchorNode(node)
     if err := nil {
     if err != nil {
         return nil, err
         return nil, err
     }
     }


     // recurse through children
     // recurse through and modify children
    var children []*html.Node
     for child := node.FirstChild; child != nil; child = child.NextSibling {
     for child := node.FirstChild; child != nil; child = child.NextSibling {
         child, err = adjust(child, page)
         err = this.adjust(child, page)
         if err != nil {
         if err != nil {
             return child, err
             return nil, err
        }
        children = append(children, child)
    }
 
    // point Child/Sibling info in structs to the new children
    if len(children) > 0 {
        node.FirstChild = children[0]
        node.LastChild = children[len(children)-1]
    }
    for index, child := range children {
        if 0 < index && index < len(children)-1 {
            child.PrevSibling = children[index-1]
            child.NextSibling = children[index+1]
         }
         }
     }
     }
     return node, nil
     return node, nil
}
}
</syntaxhighlight>


Here's a sample method that mutates a node
<syntaxhighlight lang="go">
// lower-cases all 'href' links in a '<a href="Foo/Bar">'
// lower-cases all 'href' links in a '<a href="Foo/Bar">'
func adjustAnchorNode(node *html.Node) (*html.Node, error) {
func (this *HTML) adjustAnchorNode(node *html.Node) error {
     if node.Type != html.ElementNode {
     if node.Type != html.ElementNode {
         return node, nil
         return nil
     }
     }
     if node.DataAtom != atom.A {
     if node.DataAtom != atom.A {
         return node, nil
         return nil
     }
     }
     var attrs []html.Attribute
     var attrs []html.Attribute
     for _, attr := range node.Attr {
     for _, attr := range node.Attr {
         if attr.Key != "href" {
         if attr.Key != "href" {
             attrs = append(attrs, attr)
             attrs = append(attrs, attr)
            continue
         } else {
         }
             attrs = append(attrs, strings.ToLower(attr.Val))  // <-- modify attr
 
        newAttr := html.Attribute{
             Namespace: attr.Namespace,
            Key:      attr.Key,
            Val:      strings.ToLower(attr.Val),
         }
         }
        attrs = append(attrs, newAttr)
     }
     }
    node.Attr = attrs


     return &html.Node{
     return nil
        Parent:      node.Parent,
        FirstChild:  node.FirstChild,
        LastChild:  node.LastChild,
        PrevSibling: node.PrevSibling,
        NextSibling: node.NextSibling,
        Type:        node.Type,
        DataAtom:    node.DataAtom,
        Data:        node.Data,
        Namespace:  node.Namespace,
        Attr:        attrs,
    }, nil
}
}
</syntaxhighlight>
</syntaxhighlight>
</blockquote><!-- Modifying Parsed HTML -->
</blockquote><!-- Modifying Parsed HTML -->
</blockquote><!-- Parsing/Rendering -->
</blockquote><!-- Parsing/Rendering -->

Latest revision as of 19:05, 24 July 2022

The official library for parsing HTML.
It is not shipped with go's standard library, but it is maintained by the go developers.

Documentation

official docs https://pkg.go.dev/golang.org/x/net
atom.Atom constants (element types) https://pkg.go.dev/golang.org/x/net@v0.0.0-20220706163947-c90051bbdb60/html/atom#Atom

Install

go get golang.org/x/net

Components

Nodes, NodeTypes

  • NodeType describes the kind of node in the DOM (ex. text, element, doctype, ..)
  • Nodes represent xml-like elements
  • Atoms represent html element types

ElementNodes contain TextNodes

  • ElementNodes represent an HTML element.
  • TextNodes store the value of an HTML element (nested under ElementNodes).
import "golang.org/x/net/html"
import "golang.org/x/net/html/atom"

// Text node carrying the heading's visible text.
headerVal := html.Node{
    Data: "My Page",
    Type: html.TextNode,
}

// <h1> element whose first and last (i.e. only) child is the text node above.
header := html.Node{
    Type:       html.ElementNode,
    Data:       "h1",
    DataAtom:   atom.H1,
    FirstChild: &headerVal,
    LastChild:  &headerVal,
}

Means the same as

<h1>My Page</h1>

Parsing/Rendering

Basics

import "golang.org/x/net/html"

raw := `
    <html>
      <head>
        <title>foo</title>
      </head>
      <body>
        <h1>Foo</h1>
        <p>hello world</p>
      </body>
    </html>`

// parse html; Parse returns the root of the node tree and any error
// encountered while reading the input
node, err := html.Parse(strings.NewReader(raw))
if err != nil {
    panic(err) // sample code: handle or return the error in real programs
}

// render html back into a string
var render strings.Builder
if err := html.Render(&render, node); err != nil {
    panic(err) // sample code: handle or return the error in real programs
}
render.String()              // '<html><head>...'

Modifying Parsed HTML

You can mutate Node structs in place.
When adding children, use AppendChild() so the parent, child, and sibling pointers are all linked up consistently (children are stored as a linked list of nodes, not as a slice).

  • atom has constants representing every type of HTML element.
  • Nodes keep information about their first/last child
  • Nodes keep information about their siblings (neighbors under same parent)

To iterate through children, start at the node's first child, and loop through its siblings.
Here's a reusable setup:

// HTML is a field-less type whose methods walk and adjust a parsed node tree.
type HTML struct{}

// recurse through all nodes
func (this *HTML) adjust(node *html.Node) (*html.Node, error) {
    err := this.adjustAnchorNode(node)
    if err := nil {
        return nil, err
    }

    // recurse through and modify children
    for child := node.FirstChild; child != nil; child = child.NextSibling {
        err = this.adjust(child, page)
        if err != nil {
            return nil, err
        }
    }
    return node, nil
}

// adjustAnchorNode lower-cases all 'href' links in a '<a href="Foo/Bar">'.
//
// Nodes that are not '<a>' elements are left untouched. The node's attributes
// are modified in place. The returned error is currently always nil; the
// error return lets adjust treat every node-adjusting step uniformly.
func (h *HTML) adjustAnchorNode(node *html.Node) error {
    if node.Type != html.ElementNode || node.DataAtom != atom.A {
        return nil
    }

    // Index into the slice rather than ranging by value: the range variable
    // is a copy, so assigning to it would not change node.Attr.
    for i := range node.Attr {
        if node.Attr[i].Key == "href" {
            node.Attr[i].Val = strings.ToLower(node.Attr[i].Val) // <-- modify attr
        }
    }
    return nil
}