 30ce3731a1
			
		
	
	30ce3731a1
	
	
	
		
			
			* denisenkom/go-mssqldb untagged -> v0.9.0 * github.com/editorconfig/editorconfig-core-go v2.3.7 -> v2.3.8 * github.com/go-testfixtures/testfixtures v3.4.0 -> v3.4.1 * github.com/mholt/archiver v3.3.2 -> v3.5.0 * github.com/olivere/elastic v7.0.20 -> v7.0.21 * github.com/urfave/cli v1.22.4 -> v1.22.5 * github.com/xanzy/go-gitlab v0.38.1 -> v0.39.0 * github.com/yuin/goldmark-meta untagged -> v1.0.0 * github.com/ethantkoenig/rupture 0a76f03a811a -> c3b3b810dc77 * github.com/jaytaylor/html2text 8fb95d837f7d -> 3577fbdbcff7 * github.com/kballard/go-shellquote cd60e84ee657 -> 95032a82bc51 * github.com/msteinert/pam 02ccfbfaf0cc -> 913b8f8cdf8b * github.com/unknwon/paginater 7748a72e0141 -> 042474bd0eae * CI.restart() Co-authored-by: techknowlogick <techknowlogick@gitea.io>
		
			
				
	
	
		
			536 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Go
		
	
	
	
		
			Vendored
		
	
	
	
			
		
		
	
	
			536 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Go
		
	
	
	
		
			Vendored
		
	
	
	
| package html2text
 | |
| 
 | |
| import (
 | |
| 	"bytes"
 | |
| 	"io"
 | |
| 	"regexp"
 | |
| 	"strings"
 | |
| 	"unicode"
 | |
| 
 | |
| 	"github.com/olekukonko/tablewriter"
 | |
| 	"github.com/ssor/bom"
 | |
| 	"golang.org/x/net/html"
 | |
| 	"golang.org/x/net/html/atom"
 | |
| )
 | |
| 
 | |
| // Options provide toggles and overrides to control specific rendering behaviors.
 | |
| type Options struct {
 | |
| 	PrettyTables        bool                 // Turns on pretty ASCII rendering for table elements.
 | |
| 	PrettyTablesOptions *PrettyTablesOptions // Configures pretty ASCII rendering for table elements.
 | |
| 	OmitLinks           bool                 // Turns on omitting links
 | |
| }
 | |
| 
 | |
| // PrettyTablesOptions overrides tablewriter behaviors
 | |
| type PrettyTablesOptions struct {
 | |
| 	AutoFormatHeader     bool
 | |
| 	AutoWrapText         bool
 | |
| 	ReflowDuringAutoWrap bool
 | |
| 	ColWidth             int
 | |
| 	ColumnSeparator      string
 | |
| 	RowSeparator         string
 | |
| 	CenterSeparator      string
 | |
| 	HeaderAlignment      int
 | |
| 	FooterAlignment      int
 | |
| 	Alignment            int
 | |
| 	ColumnAlignment      []int
 | |
| 	NewLine              string
 | |
| 	HeaderLine           bool
 | |
| 	RowLine              bool
 | |
| 	AutoMergeCells       bool
 | |
| 	Borders              tablewriter.Border
 | |
| }
 | |
| 
 | |
| // NewPrettyTablesOptions creates PrettyTablesOptions with default settings
 | |
| func NewPrettyTablesOptions() *PrettyTablesOptions {
 | |
| 	return &PrettyTablesOptions{
 | |
| 		AutoFormatHeader:     true,
 | |
| 		AutoWrapText:         true,
 | |
| 		ReflowDuringAutoWrap: true,
 | |
| 		ColWidth:             tablewriter.MAX_ROW_WIDTH,
 | |
| 		ColumnSeparator:      tablewriter.COLUMN,
 | |
| 		RowSeparator:         tablewriter.ROW,
 | |
| 		CenterSeparator:      tablewriter.CENTER,
 | |
| 		HeaderAlignment:      tablewriter.ALIGN_DEFAULT,
 | |
| 		FooterAlignment:      tablewriter.ALIGN_DEFAULT,
 | |
| 		Alignment:            tablewriter.ALIGN_DEFAULT,
 | |
| 		ColumnAlignment:      []int{},
 | |
| 		NewLine:              tablewriter.NEWLINE,
 | |
| 		HeaderLine:           true,
 | |
| 		RowLine:              false,
 | |
| 		AutoMergeCells:       false,
 | |
| 		Borders:              tablewriter.Border{Left: true, Right: true, Bottom: true, Top: true},
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // FromHTMLNode renders text output from a pre-parsed HTML document.
 | |
| func FromHTMLNode(doc *html.Node, o ...Options) (string, error) {
 | |
| 	var options Options
 | |
| 	if len(o) > 0 {
 | |
| 		options = o[0]
 | |
| 	}
 | |
| 
 | |
| 	ctx := textifyTraverseContext{
 | |
| 		buf:     bytes.Buffer{},
 | |
| 		options: options,
 | |
| 	}
 | |
| 	if err := ctx.traverse(doc); err != nil {
 | |
| 		return "", err
 | |
| 	}
 | |
| 
 | |
| 	text := strings.TrimSpace(newlineRe.ReplaceAllString(
 | |
| 		strings.Replace(ctx.buf.String(), "\n ", "\n", -1), "\n\n"),
 | |
| 	)
 | |
| 	return text, nil
 | |
| }
 | |
| 
 | |
| // FromReader renders text output after parsing HTML for the specified
 | |
| // io.Reader.
 | |
| func FromReader(reader io.Reader, options ...Options) (string, error) {
 | |
| 	newReader, err := bom.NewReaderWithoutBom(reader)
 | |
| 	if err != nil {
 | |
| 		return "", err
 | |
| 	}
 | |
| 	doc, err := html.Parse(newReader)
 | |
| 	if err != nil {
 | |
| 		return "", err
 | |
| 	}
 | |
| 	return FromHTMLNode(doc, options...)
 | |
| }
 | |
| 
 | |
| // FromString parses HTML from the input string, then renders the text form.
 | |
| func FromString(input string, options ...Options) (string, error) {
 | |
| 	bs := bom.CleanBom([]byte(input))
 | |
| 	text, err := FromReader(bytes.NewReader(bs), options...)
 | |
| 	if err != nil {
 | |
| 		return "", err
 | |
| 	}
 | |
| 	return text, nil
 | |
| }
 | |
| 
 | |
| var (
 | |
| 	spacingRe = regexp.MustCompile(`[ \r\n\t]+`)
 | |
| 	newlineRe = regexp.MustCompile(`\n\n+`)
 | |
| )
 | |
| 
 | |
| // traverseTableCtx holds text-related context.
 | |
| type textifyTraverseContext struct {
 | |
| 	buf bytes.Buffer
 | |
| 
 | |
| 	prefix          string
 | |
| 	tableCtx        tableTraverseContext
 | |
| 	options         Options
 | |
| 	endsWithSpace   bool
 | |
| 	justClosedDiv   bool
 | |
| 	blockquoteLevel int
 | |
| 	lineLength      int
 | |
| 	isPre           bool
 | |
| }
 | |
| 
 | |
| // tableTraverseContext holds table ASCII-form related context.
 | |
| type tableTraverseContext struct {
 | |
| 	header     []string
 | |
| 	body       [][]string
 | |
| 	footer     []string
 | |
| 	tmpRow     int
 | |
| 	isInFooter bool
 | |
| }
 | |
| 
 | |
| func (tableCtx *tableTraverseContext) init() {
 | |
| 	tableCtx.body = [][]string{}
 | |
| 	tableCtx.header = []string{}
 | |
| 	tableCtx.footer = []string{}
 | |
| 	tableCtx.isInFooter = false
 | |
| 	tableCtx.tmpRow = 0
 | |
| }
 | |
| 
 | |
| func (ctx *textifyTraverseContext) handleElement(node *html.Node) error {
 | |
| 	ctx.justClosedDiv = false
 | |
| 
 | |
| 	switch node.DataAtom {
 | |
| 	case atom.Br:
 | |
| 		return ctx.emit("\n")
 | |
| 
 | |
| 	case atom.H1, atom.H2, atom.H3:
 | |
| 		subCtx := textifyTraverseContext{}
 | |
| 		if err := subCtx.traverseChildren(node); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 
 | |
| 		str := subCtx.buf.String()
 | |
| 		dividerLen := 0
 | |
| 		for _, line := range strings.Split(str, "\n") {
 | |
| 			if lineLen := len([]rune(line)); lineLen-1 > dividerLen {
 | |
| 				dividerLen = lineLen - 1
 | |
| 			}
 | |
| 		}
 | |
| 		var divider string
 | |
| 		if node.DataAtom == atom.H1 {
 | |
| 			divider = strings.Repeat("*", dividerLen)
 | |
| 		} else {
 | |
| 			divider = strings.Repeat("-", dividerLen)
 | |
| 		}
 | |
| 
 | |
| 		if node.DataAtom == atom.H3 {
 | |
| 			return ctx.emit("\n\n" + str + "\n" + divider + "\n\n")
 | |
| 		}
 | |
| 		return ctx.emit("\n\n" + divider + "\n" + str + "\n" + divider + "\n\n")
 | |
| 
 | |
| 	case atom.Blockquote:
 | |
| 		ctx.blockquoteLevel++
 | |
| 		ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) + " "
 | |
| 		if err := ctx.emit("\n"); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		if ctx.blockquoteLevel == 1 {
 | |
| 			if err := ctx.emit("\n"); err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 		}
 | |
| 		if err := ctx.traverseChildren(node); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		ctx.blockquoteLevel--
 | |
| 		ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel)
 | |
| 		if ctx.blockquoteLevel > 0 {
 | |
| 			ctx.prefix += " "
 | |
| 		}
 | |
| 		return ctx.emit("\n\n")
 | |
| 
 | |
| 	case atom.Div:
 | |
| 		if ctx.lineLength > 0 {
 | |
| 			if err := ctx.emit("\n"); err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 		}
 | |
| 		if err := ctx.traverseChildren(node); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		var err error
 | |
| 		if !ctx.justClosedDiv {
 | |
| 			err = ctx.emit("\n")
 | |
| 		}
 | |
| 		ctx.justClosedDiv = true
 | |
| 		return err
 | |
| 
 | |
| 	case atom.Li:
 | |
| 		if err := ctx.emit("* "); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 
 | |
| 		if err := ctx.traverseChildren(node); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 
 | |
| 		return ctx.emit("\n")
 | |
| 
 | |
| 	case atom.B, atom.Strong:
 | |
| 		subCtx := textifyTraverseContext{}
 | |
| 		subCtx.endsWithSpace = true
 | |
| 		if err := subCtx.traverseChildren(node); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		str := subCtx.buf.String()
 | |
| 		return ctx.emit("*" + str + "*")
 | |
| 
 | |
| 	case atom.A:
 | |
| 		linkText := ""
 | |
| 		// For simple link element content with single text node only, peek at the link text.
 | |
| 		if node.FirstChild != nil && node.FirstChild.NextSibling == nil && node.FirstChild.Type == html.TextNode {
 | |
| 			linkText = node.FirstChild.Data
 | |
| 		}
 | |
| 
 | |
| 		// If image is the only child, take its alt text as the link text.
 | |
| 		if img := node.FirstChild; img != nil && node.LastChild == img && img.DataAtom == atom.Img {
 | |
| 			if altText := getAttrVal(img, "alt"); altText != "" {
 | |
| 				if err := ctx.emit(altText); err != nil {
 | |
| 					return err
 | |
| 				}
 | |
| 			}
 | |
| 		} else if err := ctx.traverseChildren(node); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 
 | |
| 		hrefLink := ""
 | |
| 		if attrVal := getAttrVal(node, "href"); attrVal != "" {
 | |
| 			attrVal = ctx.normalizeHrefLink(attrVal)
 | |
| 			// Don't print link href if it matches link element content or if the link is empty.
 | |
| 			if !ctx.options.OmitLinks && attrVal != "" && linkText != attrVal {
 | |
| 				hrefLink = "( " + attrVal + " )"
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		return ctx.emit(hrefLink)
 | |
| 
 | |
| 	case atom.P, atom.Ul:
 | |
| 		return ctx.paragraphHandler(node)
 | |
| 
 | |
| 	case atom.Table, atom.Tfoot, atom.Th, atom.Tr, atom.Td:
 | |
| 		if ctx.options.PrettyTables {
 | |
| 			return ctx.handleTableElement(node)
 | |
| 		} else if node.DataAtom == atom.Table {
 | |
| 			return ctx.paragraphHandler(node)
 | |
| 		}
 | |
| 		return ctx.traverseChildren(node)
 | |
| 
 | |
| 	case atom.Pre:
 | |
| 		ctx.isPre = true
 | |
| 		err := ctx.traverseChildren(node)
 | |
| 		ctx.isPre = false
 | |
| 		return err
 | |
| 
 | |
| 	case atom.Style, atom.Script, atom.Head:
 | |
| 		// Ignore the subtree.
 | |
| 		return nil
 | |
| 
 | |
| 	default:
 | |
| 		return ctx.traverseChildren(node)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // paragraphHandler renders node children surrounded by double newlines.
 | |
| func (ctx *textifyTraverseContext) paragraphHandler(node *html.Node) error {
 | |
| 	if err := ctx.emit("\n\n"); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	if err := ctx.traverseChildren(node); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	return ctx.emit("\n\n")
 | |
| }
 | |
| 
 | |
| // handleTableElement is only to be invoked when options.PrettyTables is active.
 | |
| func (ctx *textifyTraverseContext) handleTableElement(node *html.Node) error {
 | |
| 	if !ctx.options.PrettyTables {
 | |
| 		panic("handleTableElement invoked when PrettyTables not active")
 | |
| 	}
 | |
| 
 | |
| 	switch node.DataAtom {
 | |
| 	case atom.Table:
 | |
| 		if err := ctx.emit("\n\n"); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 
 | |
| 		// Re-intialize all table context.
 | |
| 		ctx.tableCtx.init()
 | |
| 
 | |
| 		// Browse children, enriching context with table data.
 | |
| 		if err := ctx.traverseChildren(node); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 
 | |
| 		buf := &bytes.Buffer{}
 | |
| 		table := tablewriter.NewWriter(buf)
 | |
| 		if ctx.options.PrettyTablesOptions != nil {
 | |
| 			options := ctx.options.PrettyTablesOptions
 | |
| 			table.SetAutoFormatHeaders(options.AutoFormatHeader)
 | |
| 			table.SetAutoWrapText(options.AutoWrapText)
 | |
| 			table.SetReflowDuringAutoWrap(options.ReflowDuringAutoWrap)
 | |
| 			table.SetColWidth(options.ColWidth)
 | |
| 			table.SetColumnSeparator(options.ColumnSeparator)
 | |
| 			table.SetRowSeparator(options.RowSeparator)
 | |
| 			table.SetCenterSeparator(options.CenterSeparator)
 | |
| 			table.SetHeaderAlignment(options.HeaderAlignment)
 | |
| 			table.SetFooterAlignment(options.FooterAlignment)
 | |
| 			table.SetAlignment(options.Alignment)
 | |
| 			table.SetColumnAlignment(options.ColumnAlignment)
 | |
| 			table.SetNewLine(options.NewLine)
 | |
| 			table.SetHeaderLine(options.HeaderLine)
 | |
| 			table.SetRowLine(options.RowLine)
 | |
| 			table.SetAutoMergeCells(options.AutoMergeCells)
 | |
| 			table.SetBorders(options.Borders)
 | |
| 		}
 | |
| 		table.SetHeader(ctx.tableCtx.header)
 | |
| 		table.SetFooter(ctx.tableCtx.footer)
 | |
| 		table.AppendBulk(ctx.tableCtx.body)
 | |
| 
 | |
| 		// Render the table using ASCII.
 | |
| 		table.Render()
 | |
| 		if err := ctx.emit(buf.String()); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 
 | |
| 		return ctx.emit("\n\n")
 | |
| 
 | |
| 	case atom.Tfoot:
 | |
| 		ctx.tableCtx.isInFooter = true
 | |
| 		if err := ctx.traverseChildren(node); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		ctx.tableCtx.isInFooter = false
 | |
| 
 | |
| 	case atom.Tr:
 | |
| 		ctx.tableCtx.body = append(ctx.tableCtx.body, []string{})
 | |
| 		if err := ctx.traverseChildren(node); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		ctx.tableCtx.tmpRow++
 | |
| 
 | |
| 	case atom.Th:
 | |
| 		res, err := ctx.renderEachChild(node)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 
 | |
| 		ctx.tableCtx.header = append(ctx.tableCtx.header, res)
 | |
| 
 | |
| 	case atom.Td:
 | |
| 		res, err := ctx.renderEachChild(node)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 
 | |
| 		if ctx.tableCtx.isInFooter {
 | |
| 			ctx.tableCtx.footer = append(ctx.tableCtx.footer, res)
 | |
| 		} else {
 | |
| 			ctx.tableCtx.body[ctx.tableCtx.tmpRow] = append(ctx.tableCtx.body[ctx.tableCtx.tmpRow], res)
 | |
| 		}
 | |
| 
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func (ctx *textifyTraverseContext) traverse(node *html.Node) error {
 | |
| 	switch node.Type {
 | |
| 	default:
 | |
| 		return ctx.traverseChildren(node)
 | |
| 
 | |
| 	case html.TextNode:
 | |
| 		var data string
 | |
| 		if ctx.isPre {
 | |
| 			data = node.Data
 | |
| 		} else {
 | |
| 			data = strings.TrimSpace(spacingRe.ReplaceAllString(node.Data, " "))
 | |
| 		}
 | |
| 		return ctx.emit(data)
 | |
| 
 | |
| 	case html.ElementNode:
 | |
| 		return ctx.handleElement(node)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (ctx *textifyTraverseContext) traverseChildren(node *html.Node) error {
 | |
| 	for c := node.FirstChild; c != nil; c = c.NextSibling {
 | |
| 		if err := ctx.traverse(c); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func (ctx *textifyTraverseContext) emit(data string) error {
 | |
| 	if data == "" {
 | |
| 		return nil
 | |
| 	}
 | |
| 	var (
 | |
| 		lines = ctx.breakLongLines(data)
 | |
| 		err   error
 | |
| 	)
 | |
| 	for _, line := range lines {
 | |
| 		runes := []rune(line)
 | |
| 		startsWithSpace := unicode.IsSpace(runes[0])
 | |
| 		if !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".") {
 | |
| 			if err = ctx.buf.WriteByte(' '); err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 			ctx.lineLength++
 | |
| 		}
 | |
| 		ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1])
 | |
| 		for _, c := range line {
 | |
| 			if _, err = ctx.buf.WriteString(string(c)); err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 			ctx.lineLength++
 | |
| 			if c == '\n' {
 | |
| 				ctx.lineLength = 0
 | |
| 				if ctx.prefix != "" {
 | |
| 					if _, err = ctx.buf.WriteString(ctx.prefix); err != nil {
 | |
| 						return err
 | |
| 					}
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| const maxLineLen = 74
 | |
| 
 | |
| func (ctx *textifyTraverseContext) breakLongLines(data string) []string {
 | |
| 	// Only break lines when in blockquotes.
 | |
| 	if ctx.blockquoteLevel == 0 {
 | |
| 		return []string{data}
 | |
| 	}
 | |
| 	var (
 | |
| 		ret      = []string{}
 | |
| 		runes    = []rune(data)
 | |
| 		l        = len(runes)
 | |
| 		existing = ctx.lineLength
 | |
| 	)
 | |
| 	if existing >= maxLineLen {
 | |
| 		ret = append(ret, "\n")
 | |
| 		existing = 0
 | |
| 	}
 | |
| 	for l+existing > maxLineLen {
 | |
| 		i := maxLineLen - existing
 | |
| 		for i >= 0 && !unicode.IsSpace(runes[i]) {
 | |
| 			i--
 | |
| 		}
 | |
| 		if i == -1 {
 | |
| 			// No spaces, so go the other way.
 | |
| 			i = maxLineLen - existing
 | |
| 			for i < l && !unicode.IsSpace(runes[i]) {
 | |
| 				i++
 | |
| 			}
 | |
| 		}
 | |
| 		ret = append(ret, string(runes[:i])+"\n")
 | |
| 		for i < l && unicode.IsSpace(runes[i]) {
 | |
| 			i++
 | |
| 		}
 | |
| 		runes = runes[i:]
 | |
| 		l = len(runes)
 | |
| 		existing = 0
 | |
| 	}
 | |
| 	if len(runes) > 0 {
 | |
| 		ret = append(ret, string(runes))
 | |
| 	}
 | |
| 	return ret
 | |
| }
 | |
| 
 | |
| func (ctx *textifyTraverseContext) normalizeHrefLink(link string) string {
 | |
| 	link = strings.TrimSpace(link)
 | |
| 	link = strings.TrimPrefix(link, "mailto:")
 | |
| 	return link
 | |
| }
 | |
| 
 | |
| // renderEachChild visits each direct child of a node and collects the sequence of
 | |
| // textuual representaitons separated by a single newline.
 | |
| func (ctx *textifyTraverseContext) renderEachChild(node *html.Node) (string, error) {
 | |
| 	buf := &bytes.Buffer{}
 | |
| 	for c := node.FirstChild; c != nil; c = c.NextSibling {
 | |
| 		s, err := FromHTMLNode(c, ctx.options)
 | |
| 		if err != nil {
 | |
| 			return "", err
 | |
| 		}
 | |
| 		if _, err = buf.WriteString(s); err != nil {
 | |
| 			return "", err
 | |
| 		}
 | |
| 		if c.NextSibling != nil {
 | |
| 			if err = buf.WriteByte('\n'); err != nil {
 | |
| 				return "", err
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	return buf.String(), nil
 | |
| }
 | |
| 
 | |
| func getAttrVal(node *html.Node, attrName string) string {
 | |
| 	for _, attr := range node.Attr {
 | |
| 		if attr.Key == attrName {
 | |
| 			return attr.Val
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return ""
 | |
| }
 |