lf/scan.go

package main

import (
	"io"
	"io/ioutil"
	"log"
	"strconv"
	"unicode"
)

// tokenType identifies the kind of token most recently produced by the
// scanner (see scanner.typ).
type tokenType byte

// Token kinds. Keywords have no kind of their own; they are scanned as
// tokenIdent. Comments never become tokens because the scanner skips them.
const (
	// tokenEOF is reported when the input is exhausted.
	tokenEOF tokenType = iota
	// no explicit keyword type

	// tokenIdent is a bare or quoted word, e.g. set, ratios, 1:2:3.
	tokenIdent
	// tokenColon is ':'.
	tokenColon
	// tokenPrefix is a command prefix: $, !, &, / or ?.
	// isPrefix currently accepts only '$', '!' and '&'.
	tokenPrefix
	// tokenLBraces is "{{".
	tokenLBraces
	// tokenRBraces is "}}".
	tokenRBraces
	// tokenCommand is the raw text from after a prefix up to the newline,
	// or the raw text between "{{" and "}}".
	tokenCommand
	// tokenSemicolon is ';'. It is also emitted with the text "\n" when a
	// newline or the end of input terminates an expression.
	tokenSemicolon
	// comments are stripped
)

// scanner splits a configuration input into tokens. Each call to scan stores
// the token it found in typ and tok; the boolean fields are mode flags that
// carry state from one call to the next.
type scanner struct {
	// buf is the input buffer, read in full by newScanner.
	buf []byte
	// off is the current offset in buf; it equals len(buf) once eof is set.
	off int
	// chr is the current character in buf, or 0 when the input is exhausted.
	chr byte
	// sem requests a terminator: when set, the next newline or the end of
	// input is reported as a tokenSemicolon.
	sem bool
	// nln is set after a ':' and makes the next newline or the end of input
	// produce a tokenSemicolon of its own.
	nln bool
	// eof is set when the buffer has ended.
	eof bool
	// key is set after the identifier "push"; the next token is then read
	// verbatim up to the next whitespace.
	key bool
	// blk is set while scanning between "{{" and "}}".
	blk bool
	// cmd is set after a prefix; the next token is raw command text.
	cmd bool
	// typ is the type of the scanned token.
	typ tokenType
	// tok is the value of the scanned token.
	tok string
	// TODO: pos
}

// newScanner reads all of r and returns a scanner positioned on its first
// byte. A read error is logged and scanning proceeds with whatever bytes were
// read. When nothing was read the scanner starts out at end of input.
func newScanner(r io.Reader) *scanner {
	data, err := ioutil.ReadAll(r)
	if err != nil {
		log.Printf("scanning: %s", err)
	}

	s := &scanner{buf: data}
	if len(data) > 0 {
		s.chr = data[0]
	} else {
		s.eof = true
	}

	return s
}

// next moves the scanner forward by one byte and loads it into s.chr. When no
// further byte exists, the offset is parked at len(s.buf), s.chr is cleared
// to 0 and s.eof is set.
func (s *scanner) next() {
	if nxt := s.off + 1; nxt < len(s.buf) {
		s.off = nxt
		s.chr = s.buf[nxt]
		return
	}

	s.off, s.chr, s.eof = len(s.buf), 0, true
}

// peek returns the byte following the current one without advancing the
// scanner, or 0 when the current byte is the last one in the buffer.
func (s *scanner) peek() byte {
	nxt := s.off + 1
	if nxt >= len(s.buf) {
		return 0
	}

	return s.buf[nxt]
}

// isSpace reports whether b is an ASCII whitespace byte: '\t', '\n', '\v',
// '\f', '\r' or ' '.
//
// Bytes of 0x80 and above are never whitespace. The scanner works on raw
// bytes, and converting such a byte to a rune would make 0x85 and 0xA0 look
// like U+0085 (NEL) and U+00A0 (NBSP), although in UTF-8 input those values
// occur inside multi-byte characters such as "à" (0xC3 0xA0).
func isSpace(b byte) bool {
	return b < 0x80 && unicode.IsSpace(rune(b))
}

// isDigit reports whether b, taken as a single-byte rune, is a decimal digit
// according to unicode.IsDigit.
func isDigit(b byte) bool {
	r := rune(b)
	return unicode.IsDigit(r)
}

// isPrefix reports whether b is one of the command prefix characters
// '$', '!' or '&'.
func isPrefix(b byte) bool {
	// TODO: how to differentiate slash in path vs search?
	switch b {
	case '$', '!', '&': // '/' and '?' are not handled yet, see TODO above
		return true
	}
	return false
}

// scan reads the next token from the input and stores its type in s.typ and
// its text in s.tok. It returns true when a token was produced and false once
// the input is exhausted, in which case s.typ is tokenEOF and s.tok is "EOF".
//
// The mode flags are consulted before the current character is looked at:
//
//   - eof: flush a pending terminator (sem, then nln), then report EOF.
//   - key: read the argument following "push" verbatim up to whitespace.
//   - blk: read the raw command text up to "}}", then the "}}" itself.
//   - cmd: read the raw command text up to the end of the line, or emit "{{"
//     and switch to block mode.
//
// Otherwise the current character selects the token: whitespace and comments
// ('#' to end of line) are skipped, newlines become tokenSemicolon when a
// terminator is pending, quoted strings and bare words become tokenIdent.
func (s *scanner) scan() bool {
scan:
	switch {
	case s.eof:
		s.next()
		// Flush pending terminators first so that the last expression is
		// terminated even when the input lacks a trailing newline.
		if s.sem {
			s.typ = tokenSemicolon
			s.tok = "\n"
			s.sem = false
			return true
		}
		if s.nln {
			s.typ = tokenSemicolon
			s.tok = "\n"
			s.nln = false
			return true
		}
		s.typ = tokenEOF
		s.tok = "EOF"
		return false
	case s.key:
		// The argument of "push" is taken as is: no quoting, escaping or
		// comment handling applies until the next whitespace.
		beg := s.off
		for !s.eof && !isSpace(s.chr) {
			s.next()
		}
		s.typ = tokenIdent
		s.tok = string(s.buf[beg:s.off])
		s.key = false
	case s.blk:
		// return here by setting s.cmd to false
		// after scanning the command in the loop below
		if !s.cmd {
			// The scanner is positioned on the closing "}}"; consume it.
			s.next()
			s.next()
			s.typ = tokenRBraces
			s.tok = "}}"
			s.blk = false
			s.sem = true
			return true
		}

		beg := s.off

		for !s.eof {
			// Test the current byte before advancing so that an empty
			// block, where "}}" directly follows "{{", is recognized too.
			// The "}}" is left in place for the next call.
			if s.chr == '}' && s.peek() == '}' {
				s.typ = tokenCommand
				s.tok = string(s.buf[beg:s.off])
				s.cmd = false
				return true
			}
			s.next()
		}

		// The input ended before the block was closed.
		s.typ = tokenEOF
		s.tok = "EOF"
		return false
	case s.cmd:
		// Leading whitespace is not part of the command.
		for !s.eof && isSpace(s.chr) {
			s.next()
		}

		if !s.eof && s.chr == '{' {
			if s.peek() == '{' {
				s.next()
				s.next()
				s.typ = tokenLBraces
				s.tok = "{{"
				// s.cmd stays set so the next call scans the block body.
				s.blk = true
				return true
			}
		}

		beg := s.off

		// The command is everything up to, but excluding, the newline.
		for !s.eof && s.chr != '\n' {
			s.next()
		}

		s.typ = tokenCommand
		s.tok = string(s.buf[beg:s.off])
		s.cmd = false
		s.sem = true
	case s.chr == '\r':
		// Skip carriage returns so that DOS line endings are handled by
		// the newline case below.
		s.next()
		goto scan
	case s.chr == '\n':
		if s.sem {
			// The newline is deliberately not consumed: the next call
			// sees it again and may emit the terminator requested by nln.
			s.typ = tokenSemicolon
			s.tok = "\n"
			s.sem = false
			return true
		}
		s.next()
		if s.nln {
			s.typ = tokenSemicolon
			s.tok = "\n"
			s.nln = false
			return true
		}
		goto scan
	case isSpace(s.chr):
		for !s.eof && isSpace(s.chr) {
			s.next()
		}
		goto scan
	case s.chr == ';':
		s.typ = tokenSemicolon
		s.tok = ";"
		s.sem = false
		s.next()
	case s.chr == '#':
		// Comments run to the end of the line; the newline is left for
		// the newline case.
		for !s.eof && s.chr != '\n' {
			s.next()
		}
		goto scan
	case s.chr == '"':
		// Double quoted string with backslash escapes.
		s.next()
		var buf []byte
		for !s.eof && s.chr != '"' {
			if s.chr == '\\' {
				s.next()
				switch s.chr {
				case '"', '\\':
					buf = append(buf, s.chr)
				case 'a':
					buf = append(buf, '\a')
				case 'b':
					buf = append(buf, '\b')
				case 'f':
					buf = append(buf, '\f')
				case 'n':
					buf = append(buf, '\n')
				case 'r':
					buf = append(buf, '\r')
				case 't':
					buf = append(buf, '\t')
				case 'v':
					buf = append(buf, '\v')
				}
				if isDigit(s.chr) && s.chr <= '7' {
					// Octal escape: one to three octal digits. Only the
					// digits 0-7 are taken so that parsing cannot fail
					// and text following the escape is not swallowed.
					var oct []byte
					for len(oct) < 3 && isDigit(s.chr) && s.chr <= '7' {
						oct = append(oct, s.chr)
						s.next()
					}
					n, err := strconv.ParseInt(string(oct), 8, 0)
					if err != nil {
						log.Printf("scanning: %s", err)
					}
					// Values above 0377 are truncated to a single byte.
					buf = append(buf, byte(n))
				} else {
					// Step over the escaped character. Characters that
					// are not a known escape are dropped.
					s.next()
				}
			} else {
				buf = append(buf, s.chr)
				s.next()
			}
		}
		s.typ = tokenIdent
		s.tok = string(buf)
		s.next() // closing quote
	case s.chr == '\'':
		// Single quoted string: taken literally, no escapes.
		s.next()
		beg := s.off
		for !s.eof && s.chr != '\'' {
			s.next()
		}
		s.typ = tokenIdent
		s.tok = string(s.buf[beg:s.off])
		s.next() // closing quote
	case s.chr == ':':
		s.typ = tokenColon
		s.tok = ":"
		s.nln = true
		s.next()
	case s.chr == '{' && s.peek() == '{':
		s.next()
		s.next()
		s.typ = tokenLBraces
		s.tok = "{{"
		s.sem = false
		s.nln = false
	case s.chr == '}' && s.peek() == '}':
		s.next()
		s.next()
		s.typ = tokenRBraces
		s.tok = "}}"
		s.sem = true
	case isPrefix(s.chr):
		s.typ = tokenPrefix
		s.tok = string(s.chr)
		s.cmd = true
		s.next()
	default:
		// Bare word: runs up to whitespace, ';' or '#'.
		var buf []byte
		for !s.eof && !isSpace(s.chr) && s.chr != ';' && s.chr != '#' {
			if s.chr == '\\' {
				s.next()
				// A backslash makes the following whitespace part of the
				// word.
				// NOTE(review): any other escaped character is dropped
				// together with the backslash, as before; confirm that
				// this is the intended behavior.
				if isSpace(s.chr) {
					buf = append(buf, s.chr)
				}
				s.next()
				// Re-check the loop condition: the byte after the escape
				// may be a terminator, another backslash or the end of
				// input and must not be appended blindly.
				continue
			}
			buf = append(buf, s.chr)
			s.next()
		}

		s.typ = tokenIdent
		s.tok = string(buf)
		s.sem = true

		if s.tok == "push" {
			// The argument of push is scanned verbatim by the key case;
			// skip the blanks before it but stay on the same line.
			s.key = true
			for !s.eof && isSpace(s.chr) && s.chr != '\n' {
				s.next()
			}
		}
	}

	return true
}
initial commit 2016-08-13 12:49:04 +00:00			`package main`

			`import (`
			`"io"`
			`"io/ioutil"`
			`"log"`
implement octal escapes in double quotes Mentioned in #47. 2017-03-15 21:13:26 +00:00			`"strconv"`
initial commit 2016-08-13 12:49:04 +00:00			`"unicode"`
			`)`

cleanup 2016-12-18 15:01:45 +00:00			`type tokenType byte`
initial commit 2016-08-13 12:49:04 +00:00
			`const (`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`tokenEOF tokenType = iota`
initial commit 2016-08-13 12:49:04 +00:00			`// no explicit keyword type`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`tokenIdent // e.g. set, ratios, 1:2:3`
			`tokenColon // :`
			`tokenPrefix // $, !, &, / or ?`
			`tokenLBraces // {{`
			`tokenRBraces // }}`
			`tokenCommand // in between a prefix to \n or between {{ and }}`
			`tokenSemicolon // ;`
initial commit 2016-08-13 12:49:04 +00:00			`// comments are stripped`
			`)`

use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`type scanner struct {`
initial commit 2016-08-13 12:49:04 +00:00			`buf []byte // input buffer`
			`off int // current offset in buf`
			`chr byte // current character in buf`
			`sem bool // insert semicolon`
			`nln bool // insert newline`
			`eof bool // buffer ended`
handle keys of push command while scanning 2016-10-15 18:46:31 +00:00			`key bool // scanning keys`
initial commit 2016-08-13 12:49:04 +00:00			`blk bool // scanning block`
			`cmd bool // scanning command`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`typ tokenType // scanned token type`
initial commit 2016-08-13 12:49:04 +00:00			`tok string // scanned token value`
			`// TODO: pos`
			`}`

use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`func newScanner(r io.Reader) *scanner {`
initial commit 2016-08-13 12:49:04 +00:00			`buf, err := ioutil.ReadAll(r)`
			`if err != nil {`
cleanup 2016-09-02 20:13:37 +00:00			`log.Printf("scanning: %s", err)`
initial commit 2016-08-13 12:49:04 +00:00			`}`

			`var eof bool`
			`var chr byte`

			`if len(buf) == 0 {`
			`eof = true`
			`} else {`
			`eof = false`
			`chr = buf[0]`
			`}`

use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`return &scanner{`
initial commit 2016-08-13 12:49:04 +00:00			`buf: buf,`
			`eof: eof,`
			`chr: chr,`
			`}`
			`}`

use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`func (s *scanner) next() {`
initial commit 2016-08-13 12:49:04 +00:00			`if s.off+1 < len(s.buf) {`
			`s.off++`
			`s.chr = s.buf[s.off]`
			`return`
			`}`

			`s.off = len(s.buf)`
			`s.chr = 0`
			`s.eof = true`
			`}`

use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`func (s *scanner) peek() byte {`
initial commit 2016-08-13 12:49:04 +00:00			`if s.off+1 < len(s.buf) {`
			`return s.buf[s.off+1]`
			`}`

			`return 0`
			`}`

			`func isSpace(b byte) bool {`
			`return unicode.IsSpace(rune(b))`
			`}`

implement octal escapes in double quotes Mentioned in #47. 2017-03-15 21:13:26 +00:00			`func isDigit(b byte) bool {`
			`return unicode.IsDigit(rune(b))`
			`}`

initial commit 2016-08-13 12:49:04 +00:00			`func isPrefix(b byte) bool {`
			`// TODO: how to differentiate slash in path vs search?`
			`return b == '$' \|\| b == '!' \|\| b == '&' // \|\| b == '/' \|\| b == '?'`
			`}`

use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`func (s *scanner) scan() bool {`
initial commit 2016-08-13 12:49:04 +00:00			`scan:`
			`switch {`
			`case s.eof:`
			`s.next()`
			`if s.sem {`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenSemicolon`
initial commit 2016-08-13 12:49:04 +00:00			`s.tok = "\n"`
			`s.sem = false`
			`return true`
			`}`
			`if s.nln {`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenSemicolon`
initial commit 2016-08-13 12:49:04 +00:00			`s.tok = "\n"`
			`s.nln = false`
			`return true`
			`}`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenEOF`
remove error handling in scanner 2016-10-15 19:53:07 +00:00			`s.tok = "EOF"`
initial commit 2016-08-13 12:49:04 +00:00			`return false`
handle keys of push command while scanning 2016-10-15 18:46:31 +00:00			`case s.key:`
			`beg := s.off`
			`for !s.eof && !isSpace(s.chr) {`
			`s.next()`
			`}`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenIdent`
handle keys of push command while scanning 2016-10-15 18:46:31 +00:00			`s.tok = string(s.buf[beg:s.off])`
			`s.key = false`
initial commit 2016-08-13 12:49:04 +00:00			`case s.blk:`
			`// return here by setting s.cmd to false`
			`// after scanning the command in the loop below`
			`if !s.cmd {`
			`s.next()`
			`s.next()`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenRBraces`
initial commit 2016-08-13 12:49:04 +00:00			`s.tok = "}}"`
			`s.blk = false`
			`s.sem = true`
			`return true`
			`}`

			`beg := s.off`

			`for !s.eof {`
			`s.next()`
			`if s.chr == '}' {`
			`if !s.eof && s.peek() == '}' {`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenCommand`
initial commit 2016-08-13 12:49:04 +00:00			`s.tok = string(s.buf[beg:s.off])`
			`s.cmd = false`
			`return true`
			`}`
			`}`
			`}`
remove error handling in scanner 2016-10-15 19:53:07 +00:00
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenEOF`
remove error handling in scanner 2016-10-15 19:53:07 +00:00			`s.tok = "EOF"`
initial commit 2016-08-13 12:49:04 +00:00			`return false`
			`case s.cmd:`
			`for !s.eof && isSpace(s.chr) {`
			`s.next()`
			`}`

			`if !s.eof && s.chr == '{' {`
			`if s.peek() == '{' {`
			`s.next()`
			`s.next()`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenLBraces`
initial commit 2016-08-13 12:49:04 +00:00			`s.tok = "{{"`
			`s.blk = true`
			`return true`
			`}`
			`}`

			`beg := s.off`

			`for !s.eof && s.chr != '\n' {`
			`s.next()`
			`}`

use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenCommand`
initial commit 2016-08-13 12:49:04 +00:00			`s.tok = string(s.buf[beg:s.off])`
			`s.cmd = false`
			`s.sem = true`
handle dos line endings in config files 2017-10-30 20:28:34 +00:00			`case s.chr == '\r':`
			`s.next()`
			`goto scan`
initial commit 2016-08-13 12:49:04 +00:00			`case s.chr == '\n':`
			`if s.sem {`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenSemicolon`
initial commit 2016-08-13 12:49:04 +00:00			`s.tok = "\n"`
			`s.sem = false`
			`return true`
			`}`
parse list expressions without extra semicolon 2016-10-15 12:56:53 +00:00			`s.next()`
initial commit 2016-08-13 12:49:04 +00:00			`if s.nln {`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenSemicolon`
initial commit 2016-08-13 12:49:04 +00:00			`s.tok = "\n"`
			`s.nln = false`
			`return true`
			`}`
			`goto scan`
			`case isSpace(s.chr):`
			`for !s.eof && isSpace(s.chr) {`
			`s.next()`
			`}`
			`goto scan`
			`case s.chr == ';':`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenSemicolon`
initial commit 2016-08-13 12:49:04 +00:00			`s.tok = ";"`
			`s.sem = false`
			`s.next()`
			`case s.chr == '#':`
			`for !s.eof && s.chr != '\n' {`
			`s.next()`
			`}`
			`goto scan`
implement double quotes with escape characters Mentioned in #60. 2017-02-11 11:42:47 +00:00			`case s.chr == '"':`
			`s.next()`
			`var buf []byte`
			`for !s.eof && s.chr != '"' {`
			`if s.chr == '\\' {`
			`s.next()`
			`switch s.chr {`
			`case '"', '\\':`
			`buf = append(buf, s.chr)`
			`case 'a':`
			`buf = append(buf, '\a')`
			`case 'b':`
			`buf = append(buf, '\b')`
			`case 'f':`
			`buf = append(buf, '\f')`
			`case 'n':`
			`buf = append(buf, '\n')`
			`case 'r':`
			`buf = append(buf, '\r')`
			`case 't':`
			`buf = append(buf, '\t')`
			`case 'v':`
			`buf = append(buf, '\v')`
			`}`
implement octal escapes in double quotes Mentioned in #47. 2017-03-15 21:13:26 +00:00			`if isDigit(s.chr) {`
			`var oct []byte`
			`for isDigit(s.chr) {`
			`oct = append(oct, s.chr)`
			`s.next()`
			`}`
			`n, err := strconv.ParseInt(string(oct), 8, 0)`
			`if err != nil {`
			`log.Printf("scanning: %s", err)`
			`}`
			`buf = append(buf, byte(n))`
			`} else {`
			`s.next()`
			`}`
implement double quotes with escape characters Mentioned in #60. 2017-02-11 11:42:47 +00:00			`} else {`
			`buf = append(buf, s.chr)`
			`s.next()`
			`}`
			`}`
			`s.typ = tokenIdent`
			`s.tok = string(buf)`
			`s.next()`
add quoting for single quote 2016-11-22 18:31:20 +00:00			`case s.chr == '\'':`
			`s.next()`
			`beg := s.off`
			`for !s.eof && s.chr != '\'' {`
			`s.next()`
			`}`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenIdent`
add quoting for single quote 2016-11-22 18:31:20 +00:00			`s.tok = string(s.buf[beg:s.off])`
			`s.next()`
initial commit 2016-08-13 12:49:04 +00:00			`case s.chr == ':':`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenColon`
initial commit 2016-08-13 12:49:04 +00:00			`s.tok = ":"`
			`s.nln = true`
			`s.next()`
remove error handling in scanner 2016-10-15 19:53:07 +00:00			`case s.chr == '{' && s.peek() == '{':`
			`s.next()`
			`s.next()`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenLBraces`
remove error handling in scanner 2016-10-15 19:53:07 +00:00			`s.tok = "{{"`
			`s.sem = false`
			`s.nln = false`
			`case s.chr == '}' && s.peek() == '}':`
			`s.next()`
			`s.next()`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenRBraces`
remove error handling in scanner 2016-10-15 19:53:07 +00:00			`s.tok = "}}"`
			`s.sem = true`
initial commit 2016-08-13 12:49:04 +00:00			`case isPrefix(s.chr):`
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenPrefix`
initial commit 2016-08-13 12:49:04 +00:00			`s.tok = string(s.chr)`
			`s.cmd = true`
			`s.next()`
			`default:`
add whitespace escaping for backslash 2016-11-22 19:12:47 +00:00			`var buf []byte`
initial commit 2016-08-13 12:49:04 +00:00			`for !s.eof && !isSpace(s.chr) && s.chr != ';' && s.chr != '#' {`
add whitespace escaping for backslash 2016-11-22 19:12:47 +00:00			`if s.chr == '\\' {`
			`s.next()`
			`if isSpace(s.chr) {`
			`buf = append(buf, s.chr)`
			`s.next()`
			`} else {`
			`s.next()`
			`}`
			`}`
			`buf = append(buf, s.chr)`
initial commit 2016-08-13 12:49:04 +00:00			`s.next()`
			`}`
remove error handling in scanner 2016-10-15 19:53:07 +00:00
use lowercase initials for type names 2016-12-17 21:47:37 +00:00			`s.typ = tokenIdent`
add whitespace escaping for backslash 2016-11-22 19:12:47 +00:00			`s.tok = string(buf)`
remove error handling in scanner 2016-10-15 19:53:07 +00:00			`s.sem = true`

handle keys of push command while scanning 2016-10-15 18:46:31 +00:00			`if s.tok == "push" {`
			`s.key = true`
remove error handling in scanner 2016-10-15 19:53:07 +00:00			`for !s.eof && isSpace(s.chr) && s.chr != '\n' {`
handle keys of push command while scanning 2016-10-15 18:46:31 +00:00			`s.next()`
			`}`
			`}`
initial commit 2016-08-13 12:49:04 +00:00			`}`

			`return true`
			`}`