Golang x/net: Difference between revisions

From wikinotes
Line 100: Line 100:
To iterate through children, start at the node's first-child, and loop through it's siblings.<br>
To iterate through children, start at the node's first-child, and loop through it's siblings.<br>
Here's a reusable setup:
Here's a reusable setup:
<syntaxhighlight lang="go">
<syntaxhighlight lang="go">
func adjust(node *html.Node, page *mwdump.Page) (*html.Node, error) {
// lower-cases all 'href' links in a '<a href="Foo/Bar">'
     var err error
func adjustAnchorNode(node *html.Node) (*html.Node, error) {
 
     // modify this node (if appropriate)
    // match current node, return new/modified instances where desired
     err := this.adjustAnchorNode(node)
    node = adjustHeadNode(node, page)
     if err := nil {
     node = adjustBodyNode(node, page)
    node = adjustAnchorNode(node)
     if err != nil {
         return nil, err
         return nil, err
     }
     }


     // recurse through children
     // recurse through and modify children
    var children []*html.Node
     for child := node.FirstChild; child != nil; child = child.NextSibling {
     for child := node.FirstChild; child != nil; child = child.NextSibling {
         child, err = adjust(child, page)
         this.adjust(child, page)
        if err != nil {
            return child, err
        }
        children = append(children, child)
    }
 
    // point Child/Sibling info in structs to the new children
    if len(children) > 0 {
        node.FirstChild = children[0]
        node.LastChild = children[len(children)-1]
    }
    for index, child := range children {
        if 0 < index && index < len(children)-1 {
            child.PrevSibling = children[index-1]
            child.NextSibling = children[index+1]
        }
     }
     }
    return node, nil
}
}
</syntaxhighlight>
</syntaxhighlight>


Here's a sample method that mutates a node
Here's a sample method that mutates a node
<syntaxhighlight lang="go">
<syntaxhighlight lang="go">
// lower-cases all 'href' links in a '<a href="Foo/Bar">'
func (this *HTML) adjustAnchorNode(node *html.Node) error {
func adjustAnchorNode(node *html.Node) (*html.Node, error) {
     if node.Type != html.ElementNode {
     if node.Type != html.ElementNode {
         return node, nil
         return nil
     }
     }
     if node.DataAtom != atom.A {
     if node.DataAtom != atom.A {
         return node, nil
         return nil
     }
     }
     var attrs []html.Attribute
     var attrs []html.Attribute
     for _, attr := range node.Attr {
     for _, attr := range node.Attr {
         if attr.Key != "href" {
         if attr.Key != "href" {
             attrs = append(attrs, attr)
             attrs = append(attrs, attr)
            continue
         } else {
         }
             attrs = append(attrs, strings.ToLower(attr.Val))  // <-- modify attr
 
        newAttr := html.Attribute{
             Namespace: attr.Namespace,
            Key:      attr.Key,
            Val:      strings.ToLower(attr.Val),
         }
         }
        attrs = append(attrs, newAttr)
     }
     }
    node.Attr = attrs


     return &html.Node{
     return nil
        Parent:      node.Parent,
        FirstChild:  node.FirstChild,
        LastChild:  node.LastChild,
        PrevSibling: node.PrevSibling,
        NextSibling: node.NextSibling,
        Type:        node.Type,
        DataAtom:    node.DataAtom,
        Data:        node.Data,
        Namespace:  node.Namespace,
        Attr:        attrs,
    }, nil
}
}
</syntaxhighlight>
</syntaxhighlight>
</blockquote><!-- Modifying Parsed HTML -->
</blockquote><!-- Modifying Parsed HTML -->
</blockquote><!-- Parsing/Rendering -->
</blockquote><!-- Parsing/Rendering -->

Revision as of 18:59, 24 July 2022

The official library for parsing HTML.
It is not shipped with Go's standard library, but it is maintained by the Go developers.

Documentation

official docs https://pkg.go.dev/golang.org/x/net
atom.Atom constants (element types) https://pkg.go.dev/golang.org/x/net@v0.0.0-20220706163947-c90051bbdb60/html/atom#Atom

Install

go get golang.org/x/net

Components

Nodes, ElementTypes

  • NodeType (html.NodeType) describes the type of a node in the DOM (e.g. text, element, doctype, ...)
  • Nodes represent xml-like elements
  • Atoms represent html element types

ElementNodes contain TextNodes

  • ElementNodes represent an HTML element.
  • TextNodes store the value of an HTML element (nested under ElementNodes).
import "golang.org/x/net/html"
import "golang.org/x/net/html/atom"

// Text node holding the heading's content ("My Page").
headerVal := html.Node{
    Type: html.TextNode,
    Data: "My Page",
}

// Element node for the <h1> tag. The text node above is its only child,
// so it is referenced as both the first and the last child.
header := html.Node{
    Type: html.ElementNode,
    DataAtom: atom.H1,
    Data: "h1",
    FirstChild: &headerVal,
    LastChild: &headerVal,
}

Means the same as

<h1>My Page</h1>

Parsing/Rendering

Basics

import "golang.org/x/net/html"

// Sample HTML document used as parser input.
raw := `
    <html>
      <head>
        <title>foo</title>
      </head>
      <body>
        <h1>Foo</h1>
        <p>hello world</p>
      </body>
    </html>`

// parse html
// NOTE: the second return value of html.Parse is discarded here for brevity.
node, _ := html.Parse(strings.NewReader(raw))

// render html
// The tree is rendered into an in-memory strings.Builder; the result of
// html.Render is not checked in this example either.
var render strings.Builder
html.Render(&render, node)
render.String()              // '<html><head>...'

Modifying Parsed HTML

Nodes are linked to each other through pointers (Parent, FirstChild, LastChild, PrevSibling, NextSibling),
so you can mutate a node in place.
When adding children, use AppendChild() so that the parent/child/sibling pointers stay consistent.

  • atom has constants representing every type of HTML element.
  • Nodes keep information about their first/last child
  • Nodes keep information about their siblings (neighbors under same parent)

To iterate through children, start at the node's first child, and loop through its siblings.
Here's a reusable setup:

// lower-cases all 'href' links in a '<a href="Foo/Bar">'
func adjustAnchorNode(node *html.Node) (*html.Node, error) {
    // modify this node (if appropriate)
    err := this.adjustAnchorNode(node)
    if err := nil {
        return nil, err
    }

    // recurse through and modify children
    for child := node.FirstChild; child != nil; child = child.NextSibling {
        this.adjust(child, page)
    }
}


Here's a sample method that mutates a node

// adjustAnchorNode lower-cases the value of every 'href' attribute on an
// anchor element, e.g. '<a href="Foo/Bar">' becomes '<a href="foo/bar">'.
//
// Nodes that are nil, are not element nodes, or are not '<a>' elements are
// left untouched. The node is modified in place; the returned error is
// currently always nil.
func (this *HTML) adjustAnchorNode(node *html.Node) error {
    if node == nil || node.Type != html.ElementNode || node.DataAtom != atom.A {
        return nil
    }

    // range yields copies of the attributes, so write through the index
    // to update the attribute stored on the node itself
    for i := range node.Attr {
        if node.Attr[i].Key == "href" {
            node.Attr[i].Val = strings.ToLower(node.Attr[i].Val) // <-- modify attr
        }
    }

    return nil
}