drone/git/parser/text.go

144 lines
3.7 KiB
Go

// Copyright 2023 Harness, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package parser
import (
"bufio"
"errors"
"fmt"
"io"
"unicode/utf8"
)
var (
ErrLineTooLong = errors.New("line too long")
)
func newUTF8Scanner(inner Scanner, modifier func([]byte) []byte) *utf8Scanner {
return &utf8Scanner{
scanner: inner,
modifier: modifier,
}
}
// utf8Scanner is wrapping the provided scanner with UTF-8 checks and a modifier function.
type utf8Scanner struct {
nextLine []byte
nextErr error
modifier func([]byte) []byte
scanner Scanner
}
func (s *utf8Scanner) Scan() bool {
scanOut := s.scanner.Scan()
if !scanOut {
s.nextLine = nil
s.nextErr = s.scanner.Err()
// to stay consistent with diff parser, treat bufio.ErrTooLong as binary file
if errors.Is(s.nextErr, bufio.ErrTooLong) {
s.nextErr = ErrBinaryFile
}
return false
}
// finalize next bytes
original := s.scanner.Bytes()
// Git is using first 8000 chars, but for now we stay consistent with diff parser
// https://git.kernel.org/pub/scm/git/git.git/tree/xdiff-interface.c?h=v2.30.0#n187
if !utf8.Valid(original) {
s.nextLine = nil
s.nextErr = ErrBinaryFile
return false
}
// copy bytes to ensure nothing happens during modification
cpy := make([]byte, len(original))
copy(cpy, original)
if s.modifier != nil {
cpy = s.modifier(cpy)
}
s.nextLine = cpy
s.nextErr = nil
return true
}
func (s *utf8Scanner) Err() error {
return s.nextErr
}
func (s *utf8Scanner) Bytes() []byte {
return s.nextLine
}
func (s *utf8Scanner) Text() string {
return string(s.nextLine)
}
// ReadTextFile returns a Scanner that reads the provided text file line by line.
//
// The returned Scanner fulfills the following:
// - If any line is larger than 64kb, the scanning fails with ErrBinaryFile
// - If the reader returns invalid UTF-8, the scanning fails with ErrBinaryFile
// - Line endings are returned as-is, unless overwriteLE is provided
func ReadTextFile(r io.Reader, overwriteLE *string) (Scanner, string, error) {
scanner := NewScannerWithPeek(r, ScanLinesWithEOF)
peekOut := scanner.Peek()
if !peekOut && scanner.Err() != nil {
return nil, "", fmt.Errorf("unknown error while peeking first line: %w", scanner.Err())
}
// get raw bytes as we don't modify the slice
firstLine := scanner.Bytes()
// Heuristic - get line ending of file by first line, default to LF if there's no line endings in the file
lineEnding := "\n"
if HasLineEndingCRLF(firstLine) {
lineEnding = "\r\n"
}
return newUTF8Scanner(scanner, func(line []byte) []byte {
// overwrite line ending if requested (unless there's no line ending - e.g. last line)
if overwriteLE != nil {
if HasLineEndingCRLF(line) {
return append(line[:len(line)-2], []byte(*overwriteLE)...)
} else if HasLineEndingLF(line) {
return append(line[:len(line)-1], []byte(*overwriteLE)...)
}
}
return line
}), lineEnding, nil
}
func HasLineEnding(line []byte) bool {
// HasLineEndingLF is superset of HasLineEndingCRLF
return HasLineEndingLF(line)
}
func HasLineEndingLF(line []byte) bool {
return len(line) >= 1 && line[len(line)-1] == '\n'
}
func HasLineEndingCRLF(line []byte) bool {
return len(line) >= 2 && line[len(line)-2] == '\r' && line[len(line)-1] == '\n'
}