| 1 | // Copyright 2013 The Go Authors. All rights reserved. |
|---|---|
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // This program takes an HTML file and outputs a corresponding article file in |
| 6 | // present format. See: golang.org/x/tools/present |
| 7 | package main // import "golang.org/x/tools/cmd/html2article" |
| 8 | |
| 9 | import ( |
| 10 | "bytes" |
| 11 | "errors" |
| 12 | "flag" |
| 13 | "fmt" |
| 14 | "io" |
| 15 | "log" |
| 16 | "net/url" |
| 17 | "os" |
| 18 | "regexp" |
| 19 | "strings" |
| 20 | |
| 21 | "golang.org/x/net/html" |
| 22 | "golang.org/x/net/html/atom" |
| 23 | ) |
| 24 | |
| 25 | func main() { |
| 26 | flag.Parse() |
| 27 | |
| 28 | err := convert(os.Stdout, os.Stdin) |
| 29 | if err != nil { |
| 30 | log.Fatal(err) |
| 31 | } |
| 32 | } |
| 33 | |
| 34 | func convert(w io.Writer, r io.Reader) error { |
| 35 | root, err := html.Parse(r) |
| 36 | if err != nil { |
| 37 | return err |
| 38 | } |
| 39 | |
| 40 | style := find(root, isTag(atom.Style)) |
| 41 | if err := parseStyles(style); err != nil { |
| 42 | log.Printf("couldn't parse all styles: %v", err) |
| 43 | } |
| 44 | |
| 45 | body := find(root, isTag(atom.Body)) |
| 46 | if body == nil { |
| 47 | return errors.New("couldn't find body") |
| 48 | } |
| 49 | article := limitNewlineRuns(makeHeadings(strings.TrimSpace(text(body)))) |
| 50 | _, err = fmt.Fprintf(w, "Title\n\n%s", article) |
| 51 | return err |
| 52 | } |
| 53 | |
// Style identifies a kind of inline emphasis. Its value is the marker
// character used for that emphasis in the generated article.
type Style string

// The emphasis styles that CSS rules can be classified as.
const (
	Bold   = Style("*")
	Italic = Style("_")
	Code   = Style("`")
)

// cssRules maps a CSS selector found in the document's style sheet to the
// emphasis style its declarations imply. It is filled in by parseStyles and
// consulted by hasStyle.
var cssRules = map[string]Style{}
| 63 | |
| 64 | func parseStyles(style *html.Node) error { |
| 65 | if style == nil || style.FirstChild == nil { |
| 66 | return errors.New("couldn't find styles") |
| 67 | } |
| 68 | |
| 69 | styles := style.FirstChild.Data |
| 70 | readUntil := func(end rune) (string, bool) { |
| 71 | i := strings.IndexRune(styles, end) |
| 72 | if i < 0 { |
| 73 | return "", false |
| 74 | } |
| 75 | s := styles[:i] |
| 76 | styles = styles[i:] |
| 77 | return s, true |
| 78 | } |
| 79 | |
| 80 | for { |
| 81 | sel, ok := readUntil('{') |
| 82 | if !ok && sel == "" { |
| 83 | break |
| 84 | } else if !ok { |
| 85 | return fmt.Errorf("could not parse selector %q", styles) |
| 86 | } |
| 87 | |
| 88 | value, ok := readUntil('}') |
| 89 | if !ok { |
| 90 | return fmt.Errorf("couldn't parse style body for %s", sel) |
| 91 | } |
| 92 | switch { |
| 93 | case strings.Contains(value, "italic"): |
| 94 | cssRules[sel] = Italic |
| 95 | case strings.Contains(value, "bold"): |
| 96 | cssRules[sel] = Bold |
| 97 | case strings.Contains(value, "Consolas") || strings.Contains(value, "Courier New"): |
| 98 | cssRules[sel] = Code |
| 99 | } |
| 100 | } |
| 101 | return nil |
| 102 | } |
| 103 | |
// newlineRun matches two or more consecutive newlines.
var newlineRun = regexp.MustCompile(`\n\n+`)

// limitNewlineRuns collapses every run of two or more newlines in s into
// exactly two, so that paragraphs are separated by a single blank line.
func limitNewlineRuns(s string) string {
	const paragraphBreak = "\n\n"
	return newlineRun.ReplaceAllLiteralString(s, paragraphBreak)
}
| 109 | |
| 110 | func makeHeadings(body string) string { |
| 111 | buf := new(bytes.Buffer) |
| 112 | lines := strings.Split(body, "\n") |
| 113 | for i, s := range lines { |
| 114 | if i == 0 && !isBoldTitle(s) { |
| 115 | buf.WriteString("* Introduction\n\n") |
| 116 | } |
| 117 | if isBoldTitle(s) { |
| 118 | s = strings.TrimSpace(strings.Replace(s, "*", " ", -1)) |
| 119 | s = "* " + s |
| 120 | } |
| 121 | buf.WriteString(s) |
| 122 | buf.WriteByte('\n') |
| 123 | } |
| 124 | return buf.String() |
| 125 | } |
| 126 | |
// isBoldTitle reports whether s looks like a line made up of a single bold
// run: it contains no spaces, starts and ends with "*", and holds at least one
// character that is not an asterisk.
func isBoldTitle(s string) bool {
	if strings.Contains(s, " ") {
		return false
	}
	if !strings.HasPrefix(s, "*") || !strings.HasSuffix(s, "*") {
		return false
	}
	// A lone "*" is its own prefix and suffix, and "**" is an empty bold run.
	// Neither carries a title, so they must not become empty headings.
	return strings.Trim(s, "*") != ""
}
| 132 | |
// indent writes s to buf line by line, prefixing every non-empty line with a
// tab. Each line, including a trailing empty one, is followed by a newline.
func indent(buf *bytes.Buffer, s string) {
	lines := strings.Split(s, "\n")
	for i := range lines {
		if line := lines[i]; line != "" {
			buf.WriteString("\t" + line)
		}
		buf.WriteByte('\n')
	}
}
| 142 | |
// unwrap writes s to buf with hard line breaks removed. Each line is trimmed
// of surrounding whitespace; consecutive non-blank lines are joined with a
// single space, and one or more blank lines following text become a single
// paragraph break ("\n\n"). Blank lines before any text are dropped.
func unwrap(buf *bytes.Buffer, s string) {
	inParagraph := false
	for _, raw := range strings.Split(s, "\n") {
		line := strings.TrimSpace(raw)
		switch {
		case line == "" && inParagraph:
			// First blank line after text closes the paragraph.
			buf.WriteString("\n\n")
			inParagraph = false
		case line == "":
			// Still between paragraphs; nothing to write.
		case inParagraph:
			buf.WriteByte(' ')
			buf.WriteString(line)
		default:
			buf.WriteString(line)
			inParagraph = true
		}
	}
}
| 162 | |
| 163 | func text(n *html.Node) string { |
| 164 | var buf bytes.Buffer |
| 165 | walk(n, func(n *html.Node) bool { |
| 166 | switch n.Type { |
| 167 | case html.TextNode: |
| 168 | buf.WriteString(n.Data) |
| 169 | return false |
| 170 | case html.ElementNode: |
| 171 | // no-op |
| 172 | default: |
| 173 | return true |
| 174 | } |
| 175 | a := n.DataAtom |
| 176 | if a == atom.Span { |
| 177 | switch { |
| 178 | case hasStyle(Code)(n): |
| 179 | a = atom.Code |
| 180 | case hasStyle(Bold)(n): |
| 181 | a = atom.B |
| 182 | case hasStyle(Italic)(n): |
| 183 | a = atom.I |
| 184 | } |
| 185 | } |
| 186 | switch a { |
| 187 | case atom.Br: |
| 188 | buf.WriteByte('\n') |
| 189 | case atom.P: |
| 190 | unwrap(&buf, childText(n)) |
| 191 | buf.WriteString("\n\n") |
| 192 | case atom.Li: |
| 193 | buf.WriteString("- ") |
| 194 | unwrap(&buf, childText(n)) |
| 195 | buf.WriteByte('\n') |
| 196 | case atom.Pre: |
| 197 | indent(&buf, childText(n)) |
| 198 | buf.WriteByte('\n') |
| 199 | case atom.A: |
| 200 | href, text := attr(n, "href"), childText(n) |
| 201 | // Skip links with no text. |
| 202 | if strings.TrimSpace(text) == "" { |
| 203 | break |
| 204 | } |
| 205 | // Don't emit empty links. |
| 206 | if strings.TrimSpace(href) == "" { |
| 207 | buf.WriteString(text) |
| 208 | break |
| 209 | } |
| 210 | // Use original url for Google Docs redirections. |
| 211 | if u, err := url.Parse(href); err != nil { |
| 212 | log.Printf("parsing url %q: %v", href, err) |
| 213 | } else if u.Host == "www.google.com" && u.Path == "/url" { |
| 214 | href = u.Query().Get("q") |
| 215 | } |
| 216 | fmt.Fprintf(&buf, "[[%s][%s]]", href, text) |
| 217 | case atom.Code: |
| 218 | buf.WriteString(highlight(n, "`")) |
| 219 | case atom.B: |
| 220 | buf.WriteString(highlight(n, "*")) |
| 221 | case atom.I: |
| 222 | buf.WriteString(highlight(n, "_")) |
| 223 | case atom.Img: |
| 224 | src := attr(n, "src") |
| 225 | fmt.Fprintf(&buf, ".image %s\n", src) |
| 226 | case atom.Iframe: |
| 227 | src, w, h := attr(n, "src"), attr(n, "width"), attr(n, "height") |
| 228 | fmt.Fprintf(&buf, "\n.iframe %s %s %s\n", src, h, w) |
| 229 | case atom.Param: |
| 230 | if attr(n, "name") == "movie" { |
| 231 | // Old style YouTube embed. |
| 232 | u := attr(n, "value") |
| 233 | u = strings.Replace(u, "/v/", "/embed/", 1) |
| 234 | if i := strings.Index(u, "&"); i >= 0 { |
| 235 | u = u[:i] |
| 236 | } |
| 237 | fmt.Fprintf(&buf, "\n.iframe %s 540 304\n", u) |
| 238 | } |
| 239 | case atom.Title: |
| 240 | default: |
| 241 | return true |
| 242 | } |
| 243 | return false |
| 244 | }) |
| 245 | return buf.String() |
| 246 | } |
| 247 | |
| 248 | func childText(node *html.Node) string { |
| 249 | var buf bytes.Buffer |
| 250 | for n := node.FirstChild; n != nil; n = n.NextSibling { |
| 251 | fmt.Fprint(&buf, text(n)) |
| 252 | } |
| 253 | return buf.String() |
| 254 | } |
| 255 | |
| 256 | func highlight(node *html.Node, char string) string { |
| 257 | t := strings.Replace(childText(node), " ", char, -1) |
| 258 | return fmt.Sprintf("%s%s%s", char, t, char) |
| 259 | } |
| 260 | |
| 261 | type selector func(*html.Node) bool |
| 262 | |
| 263 | func isTag(a atom.Atom) selector { |
| 264 | return func(n *html.Node) bool { |
| 265 | return n.DataAtom == a |
| 266 | } |
| 267 | } |
| 268 | |
| 269 | func hasClass(name string) selector { |
| 270 | return func(n *html.Node) bool { |
| 271 | for _, a := range n.Attr { |
| 272 | if a.Key == "class" { |
| 273 | for _, c := range strings.Fields(a.Val) { |
| 274 | if c == name { |
| 275 | return true |
| 276 | } |
| 277 | } |
| 278 | } |
| 279 | } |
| 280 | return false |
| 281 | } |
| 282 | } |
| 283 | |
| 284 | func hasStyle(s Style) selector { |
| 285 | return func(n *html.Node) bool { |
| 286 | for rule, s2 := range cssRules { |
| 287 | if s2 != s { |
| 288 | continue |
| 289 | } |
| 290 | if strings.HasPrefix(rule, ".") && hasClass(rule[1:])(n) { |
| 291 | return true |
| 292 | } |
| 293 | if n.DataAtom.String() == rule { |
| 294 | return true |
| 295 | } |
| 296 | } |
| 297 | return false |
| 298 | } |
| 299 | } |
| 300 | |
| 301 | func attr(node *html.Node, key string) (value string) { |
| 302 | for _, attr := range node.Attr { |
| 303 | if attr.Key == key { |
| 304 | return attr.Val |
| 305 | } |
| 306 | } |
| 307 | return "" |
| 308 | } |
| 309 | |
| 310 | func find(n *html.Node, fn selector) *html.Node { |
| 311 | var result *html.Node |
| 312 | walk(n, func(n *html.Node) bool { |
| 313 | if result != nil { |
| 314 | return false |
| 315 | } |
| 316 | if fn(n) { |
| 317 | result = n |
| 318 | return false |
| 319 | } |
| 320 | return true |
| 321 | }) |
| 322 | return result |
| 323 | } |
| 324 | |
| 325 | func walk(n *html.Node, fn selector) { |
| 326 | if fn(n) { |
| 327 | for c := n.FirstChild; c != nil; c = c.NextSibling { |
| 328 | walk(c, fn) |
| 329 | } |
| 330 | } |
| 331 | } |
| 332 |
Members