Documentation
¶
Overview ¶
Package html2data - extract data from HTML via CSS selectors
Install package and command line utility:
go get -u github.com/msoap/html2data/cmd/html2data
Install package only:
go get -u github.com/msoap/html2data
Allowed pseudo-selectors:
:attr(attr_name) - for getting attributes instead of text
:html - for getting HTML instead of text
:get(N) - get n-th element from list
Command line utility:
html2data URL "css selector"
html2data file.html "css selector"
cat file.html | html2data "css selector"
Example ¶
package main
import (
"fmt"
"log"
"github.com/msoap/html2data"
)
// main fetches a page and prints its title (trimmed and untrimmed),
// every H1 header and every link URL. Any extraction error aborts the
// program instead of being silently discarded.
func main() {
	doc := html2data.FromURL("http://example.com")
	// or with config
	// doc := html2data.FromURL("http://example.com", html2data.URLCfg{UA: "userAgent", TimeOut: 10, DontDetectCharset: true})
	if doc.Err != nil {
		log.Fatal(doc.Err)
	}

	// get title
	title, err := doc.GetDataSingle("title")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("Title is:", title)

	// same selector, but ask for the text as is (no space trimming)
	title, err = doc.GetDataSingle("title", html2data.Cfg{DontTrimSpaces: true})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("Title as is, with spaces:", title)

	texts, err := doc.GetData(map[string]string{"h1": "h1", "links": "a:attr(href)"})
	if err != nil {
		log.Fatal(err)
	}

	// get all H1 headers (ranging over a missing key simply iterates zero times):
	for _, text := range texts["h1"] {
		fmt.Println(text)
	}

	// get all urls from links
	for _, text := range texts["links"] {
		fmt.Println(text)
	}
}
Index ¶
- type CSSSelector
- type Cfg
- type Doc
- func (doc Doc) GetData(selectors map[string]string, configs ...Cfg) (result map[string][]string, err error)
- func (doc Doc) GetDataFirst(selectors map[string]string, configs ...Cfg) (result map[string]string, err error)
- func (doc Doc) GetDataNested(selectorRaw string, nestedSelectors map[string]string, configs ...Cfg) (result []map[string][]string, err error)
- func (doc Doc) GetDataNestedFirst(selectorRaw string, nestedSelectors map[string]string, configs ...Cfg) (result []map[string]string, err error)
- func (doc Doc) GetDataSingle(selector string, configs ...Cfg) (result string, err error)
- type URLCfg
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type CSSSelector ¶
type CSSSelector struct {
// contains filtered or unexported fields
}
CSSSelector - selector with settings
type Cfg ¶
// Cfg - config for GetData* methods.
type Cfg struct {
DontTrimSpaces bool // return text as is; by default spaces are trimmed
}
Cfg - config for GetData* methods
type Doc ¶
type Doc struct {
Err error
// contains filtered or unexported fields
}
Doc - HTML document for parsing
func FromFile ¶
FromFile - get doc from file
Example ¶
package main
import (
"log"
"github.com/msoap/html2data"
)
// main loads an HTML document from a file and aborts if loading failed.
func main() {
	// The load error, if any, is carried in the returned document's Err field.
	if err := html2data.FromFile("file_name.html").Err; err != nil {
		log.Fatal(err)
	}
}
func FromReader ¶
FromReader - get doc from io.Reader
Example ¶
package main
import (
"bufio"
"log"
"os"
"github.com/msoap/html2data"
)
// main reads an HTML document from standard input and aborts if reading failed.
func main() {
	stdin := bufio.NewReader(os.Stdin)

	// The read error, if any, is carried in the returned document's Err field.
	if err := html2data.FromReader(stdin).Err; err != nil {
		log.Fatal(err)
	}
}
func FromURL ¶
FromURL - get doc from URL
FromURL("https://url")
FromURL("https://url", URLCfg{UA: "Custom UA 1.0", TimeOut: 10})
Example ¶
package main
import (
"log"
"github.com/msoap/html2data"
)
// main fetches the same URL twice: first with the default settings,
// then with an explicit URLCfg. Either failure aborts the program.
func main() {
	// default settings
	if err := html2data.FromURL("http://example.com").Err; err != nil {
		log.Fatal(err)
	}

	// or with config
	cfg := html2data.URLCfg{UA: "userAgent", TimeOut: 10, DontDetectCharset: false}
	if err := html2data.FromURL("http://example.com", cfg).Err; err != nil {
		log.Fatal(err)
	}
}
func (Doc) GetData ¶
func (doc Doc) GetData(selectors map[string]string, configs ...Cfg) (result map[string][]string, err error)
GetData - extract data by CSS-selectors
texts, err := doc.GetData(map[string]string{"h1": "h1"})
Example ¶
package main
import (
"fmt"
"github.com/msoap/html2data"
)
// main prints every H1 header and every link URL found on the page.
// An extraction error is reported rather than silently ignored.
func main() {
	texts, err := html2data.FromURL("http://example.com").GetData(map[string]string{"headers": "h1", "links": "a:attr(href)"})
	if err != nil {
		// only fmt is imported by this example, so report and stop
		fmt.Println("error:", err)
		return
	}

	// get all H1 headers:
	if textOne, ok := texts["headers"]; ok {
		for _, text := range textOne {
			fmt.Println(text)
		}
	}

	// get all urls from links
	if links, ok := texts["links"]; ok {
		for _, text := range links {
			fmt.Println(text)
		}
	}
}
func (Doc) GetDataFirst ¶
func (doc Doc) GetDataFirst(selectors map[string]string, configs ...Cfg) (result map[string]string, err error)
GetDataFirst - extract data by CSS-selectors, get first entry for each selector or ""
texts, err := doc.GetDataFirst(map[string]string{"h1": "h1"})
Example ¶
package main
import (
"fmt"
"log"
"github.com/msoap/html2data"
)
// main prints the first H1 header and the URL of the first link on the page.
func main() {
	selectors := map[string]string{
		"header":     "h1",
		"first_link": "a:attr(href)",
	}

	page := html2data.FromURL("http://example.com")
	first, err := page.GetDataFirst(selectors)
	if err != nil {
		log.Fatal(err)
	}

	// results are keyed by the names used in the selectors map
	header, firstLink := first["header"], first["first_link"]
	fmt.Println("header: ", header)
	fmt.Println("first link: ", firstLink)
}
func (Doc) GetDataNested ¶
func (doc Doc) GetDataNested(selectorRaw string, nestedSelectors map[string]string, configs ...Cfg) (result []map[string][]string, err error)
GetDataNested - extract nested data by CSS-selectors from another CSS-selector
texts, err := doc.GetDataNested("CSS.selector", map[string]string{"h1": "h1"}) - get h1 from CSS.selector
Example ¶
package main
import (
"fmt"
"github.com/msoap/html2data"
)
// main prints, for each <div class="article"> in the file, every H1 header
// and every link URL nested inside it. An extraction error is reported
// rather than silently ignored.
func main() {
	texts, err := html2data.FromFile("test.html").GetDataNested("div.article", map[string]string{"headers": "h1", "links": "a:attr(href)"})
	if err != nil {
		// only fmt is imported by this example, so report and stop
		fmt.Println("error:", err)
		return
	}

	for _, article := range texts {
		// get all H1 headers inside each <div class="article">:
		if textOne, ok := article["headers"]; ok {
			for _, text := range textOne {
				fmt.Println(text)
			}
		}

		// get all urls from links inside each <div class="article">
		if links, ok := article["links"]; ok {
			for _, text := range links {
				fmt.Println(text)
			}
		}
	}
}
func (Doc) GetDataNestedFirst ¶
func (doc Doc) GetDataNestedFirst(selectorRaw string, nestedSelectors map[string]string, configs ...Cfg) (result []map[string]string, err error)
GetDataNestedFirst - extract nested data by CSS-selectors from another CSS-selector, get first entry for each selector or ""
texts, err := doc.GetDataNestedFirst("CSS.selector", map[string]string{"h1": "h1"}) - get h1 from CSS.selector
Example ¶
package main
import (
"fmt"
"log"
"github.com/msoap/html2data"
)
// main prints, for each <div class="block"> in the test file, the first H1
// header, the first link URL and the first span (expected to be empty).
func main() {
	texts, err := html2data.FromFile("cmd/html2data/test.html").GetDataNestedFirst("div.block", map[string]string{"header": "h1", "link": "a:attr(href)", "sp": "span"})
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println("")
	for _, block := range texts {
		// get first H1 header
		fmt.Printf("header - %s\n", block["header"])

		// get first link
		fmt.Printf("first URL - %s\n", block["link"])

		// get the non-existent span: read it under "sp", the key it was
		// requested with, so the empty result comes from the selector
		// and not from a missing map key
		fmt.Printf("span - '%s'\n", block["sp"])
	}
}
Output:

header - Head1.1
first URL - http://url1
span - ''
header - Head2.1
first URL - http://url2
span - ''
func (Doc) GetDataSingle ¶
GetDataSingle - extract data by one CSS-selector
title, err := doc.GetDataSingle("title")
Example ¶
package main
import (
"fmt"
"log"
"github.com/msoap/html2data"
)
// main prints the text of the <title> element of the test file.
func main() {
	page := html2data.FromFile("cmd/html2data/test.html")

	// get title
	pageTitle, err := page.GetDataSingle("title")
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println("Title is:", pageTitle)
}
Output: Title is: Title