Handle symlinks and submodules in diffcut (#1185)

pull/3497/head
Marko Gacesa 2024-04-08 10:46:40 +00:00 committed by Harness
parent f879f74774
commit ad5989dae4
6 changed files with 352 additions and 50 deletions

View File

@ -407,14 +407,13 @@ func (g *Git) DiffCut(
return parser.HunkHeader{}, parser.Hunk{}, fmt.Errorf("failed to find the list of changed files: %w", err)
}
var (
oldSHA, newSHA string
filePath string
)
var oldSHA, newSHA string
for _, entry := range diffEntries {
switch entry.Status {
case parser.DiffStatusRenamed, parser.DiffStatusCopied:
if entry.Status == parser.DiffStatusRenamed || entry.Status == parser.DiffStatusCopied {
// Entries with the status 'R' and 'C' output two paths: the old path and the new path.
// Using the params.LineStartNew flag to match the path with the entry's old or new path.
if entry.Path != path && entry.OldPath != path {
continue
}
@ -427,34 +426,79 @@ func (g *Git) DiffCut(
msg := "for renamed files provide the old file name if commenting the old lines"
return parser.HunkHeader{}, parser.Hunk{}, errors.InvalidArgument(msg)
}
default:
if entry.Path != path {
continue
}
} else if entry.Path != path {
// All other statuses output just one path: If the path doesn't match it, proceed with the next entry.
continue
}
rawFileMode := entry.OldFileMode
if params.LineStartNew {
rawFileMode = entry.NewFileMode
}
fileType, _, err := parseTreeNodeMode(rawFileMode)
if err != nil {
return parser.HunkHeader{}, parser.Hunk{},
fmt.Errorf("failed to parse file mode %s for path %s: %w", rawFileMode, path, err)
}
switch fileType {
default:
return parser.HunkHeader{}, parser.Hunk{}, errors.Internal(nil, "unrecognized object type")
case TreeNodeTypeCommit:
msg := "code comment is not allowed on a submodule"
return parser.HunkHeader{}, parser.Hunk{}, errors.InvalidArgument(msg)
case TreeNodeTypeTree:
msg := "code comment is not allowed on a directory"
return parser.HunkHeader{}, parser.Hunk{}, errors.InvalidArgument(msg)
case TreeNodeTypeBlob:
// a blob is what we want
}
var hunkHeader parser.HunkHeader
var hunk parser.Hunk
switch entry.Status {
case parser.DiffStatusRenamed, parser.DiffStatusCopied, parser.DiffStatusModified:
// For modified and renamed compare file blobs directly.
// For modified and renamed compare file blob SHAs directly.
oldSHA = entry.OldBlobSHA
newSHA = entry.NewBlobSHA
case parser.DiffStatusAdded, parser.DiffStatusDeleted:
// For added and deleted compare commits, but with the provided path.
oldSHA = targetRef
newSHA = sourceRef
filePath = entry.Path
hunkHeader, hunk, err = g.diffCutFromHunk(ctx, repoPath, oldSHA, newSHA, params)
case parser.DiffStatusAdded, parser.DiffStatusDeleted, parser.DiffStatusType:
// for added and deleted files read the file content directly
if params.LineStartNew {
hunkHeader, hunk, err = g.diffCutFromBlob(ctx, repoPath, true, entry.NewBlobSHA, params)
} else {
hunkHeader, hunk, err = g.diffCutFromBlob(ctx, repoPath, false, entry.OldBlobSHA, params)
}
default:
return parser.HunkHeader{}, parser.Hunk{},
fmt.Errorf("unrecognized git diff file status=%c for path=%s", entry.Status, path)
}
if err != nil {
return parser.HunkHeader{}, parser.Hunk{},
fmt.Errorf("failed to extract hunk for git diff file status=%c path=%s: %w", entry.Status, path, err)
}
break
// The returned diff hunk will be stored in the DB and will only be used for display, as a reference.
// Therefore, we trim too long lines to protect the system and the DB.
const maxLineLen = 200
parser.LimitLineLen(&hunk.Lines, maxLineLen)
return hunkHeader, hunk, nil
}
if newSHA == "" {
return parser.HunkHeader{}, parser.Hunk{}, errors.NotFound("file %s not found in the diff", path)
}
return parser.HunkHeader{}, parser.Hunk{}, errors.NotFound("path not found")
}
// next pull the diff cut for the requested file
pipeRead, pipeWrite = io.Pipe()
func (g *Git) diffCutFromHunk(
ctx context.Context,
repoPath string,
oldSHA string,
newSHA string,
params parser.DiffCutParams,
) (parser.HunkHeader, parser.Hunk, error) {
pipeRead, pipeWrite := io.Pipe()
stderr := bytes.NewBuffer(nil)
go func() {
var err error
@ -470,11 +514,6 @@ func (g *Git) DiffCut(
command.WithFlag("--unified=100000000"),
command.WithArg(oldSHA),
command.WithArg(newSHA))
if filePath != "" {
cmd.Add(
command.WithFlag("--merge-base"),
command.WithPostSepArg(filePath))
}
err = cmd.Run(ctx,
command.WithDir(repoPath),
@ -495,6 +534,75 @@ func (g *Git) DiffCut(
return diffCutHeader, linesHunk, nil
}
func (g *Git) diffCutFromBlob(
ctx context.Context,
repoPath string,
asAdded bool,
sha string,
params parser.DiffCutParams,
) (parser.HunkHeader, parser.Hunk, error) {
pipeRead, pipeWrite := io.Pipe()
go func() {
var err error
defer func() {
// If running of the command below fails, make the pipe reader also fail with the same error.
_ = pipeWrite.CloseWithError(err)
}()
cmd := command.New("cat-file",
command.WithFlag("-p"),
command.WithArg(sha))
err = cmd.Run(ctx,
command.WithDir(repoPath),
command.WithStdout(pipeWrite))
}()
cutHeader, cut, err := parser.BlobCut(pipeRead, params)
if err != nil {
// Next check if reading the git diff output caused an error
return parser.HunkHeader{}, parser.Hunk{}, err
}
// Convert parser.CutHeader to parser.HunkHeader and parser.Cut to parser.Hunk.
var hunkHeader parser.HunkHeader
var hunk parser.Hunk
if asAdded {
for i := range cut.Lines {
cut.Lines[i] = "+ " + cut.Lines[i]
}
hunkHeader = parser.HunkHeader{
NewLine: cutHeader.Line,
NewSpan: cutHeader.Span,
}
hunk = parser.Hunk{
HunkHeader: parser.HunkHeader{NewLine: cut.Line, NewSpan: cut.Span},
Lines: cut.Lines,
}
} else {
for i := range cut.Lines {
cut.Lines[i] = "- " + cut.Lines[i]
}
hunkHeader = parser.HunkHeader{
OldLine: cutHeader.Line,
OldSpan: cutHeader.Span,
}
hunk = parser.Hunk{
HunkHeader: parser.HunkHeader{OldLine: cut.Line, OldSpan: cut.Span},
Lines: cut.Lines,
}
}
return hunkHeader, hunk, nil
}
func (g *Git) DiffFileName(ctx context.Context,
repoPath string,
baseRef string,

View File

@ -17,7 +17,9 @@ package parser
import (
"bufio"
"errors"
"fmt"
"io"
"unicode/utf8"
)
type DiffFileHeader struct {
@ -239,6 +241,97 @@ again:
return
}
// BlobCut parses raw file and returns lines specified with the parameter.
func BlobCut(r io.Reader, params DiffCutParams) (CutHeader, Cut, error) {
scanner := bufio.NewScanner(r)
var (
err error
lineNumber int
inCut bool
cutStart, cutSpan int
cutLines []string
)
extStart := params.LineStart - params.BeforeLines
extEnd := params.LineEnd + params.AfterLines
linesNeeded := params.LineEnd - params.LineStart + 1
for {
if !scanner.Scan() {
err = scanner.Err()
break
}
lineNumber++
line := scanner.Text()
if !utf8.ValidString(line) {
return CutHeader{}, Cut{}, ErrBinaryFile
}
if lineNumber > extEnd {
break // exceeded the requested line range
}
if lineNumber < extStart {
// not yet in the requested line range
continue
}
if !inCut {
cutStart = lineNumber
inCut = true
}
cutLines = append(cutLines, line)
cutSpan++
if lineNumber >= params.LineStart && lineNumber <= params.LineEnd {
linesNeeded--
}
if len(cutLines) >= params.LineLimit {
break
}
}
if errors.Is(err, bufio.ErrTooLong) {
// By default, the max token size is 65536 (bufio.MaxScanTokenSize).
// If the file contains a line that is longer than this we treat it as a binary file.
return CutHeader{}, Cut{}, ErrBinaryFile
}
if err != nil && !errors.Is(err, io.EOF) {
return CutHeader{}, Cut{}, fmt.Errorf("failed to parse blob cut: %w", err)
}
if !inCut || linesNeeded > 0 {
return CutHeader{}, Cut{}, ErrHunkNotFound
}
// the cut header is hunk-like header (with Line and Span) that describes the requested lines exactly
ch := CutHeader{Line: params.LineStart, Span: params.LineEnd - params.LineStart + 1}
// the cut includes the requested lines and few more lines specified with the BeforeLines and AfterLines.
c := Cut{CutHeader: CutHeader{Line: cutStart, Span: cutSpan}, Lines: cutLines}
return ch, c, nil
}
func LimitLineLen(lines *[]string, maxLen int) {
outer:
for idxLine, line := range *lines {
var l int
for idxRune := range line {
l++
if l > maxLen {
(*lines)[idxLine] = line[:idxRune] + "…" // append the ellipsis to indicate that the line was trimmed.
continue outer
}
}
}
}
type strCircBuf struct {
head int
entries []string

View File

@ -237,6 +237,88 @@ index af7864ba..541cb64f 100644
}
}
func TestBlobCut(t *testing.T) {
const input = `1
2
3
4
5
6`
tests := []struct {
name string
params DiffCutParams
expCutHeader CutHeader
expCut Cut
expError error
}{
{
name: "first 3 lines",
params: DiffCutParams{LineStart: 1, LineEnd: 3, LineLimit: 40},
expCutHeader: CutHeader{Line: 1, Span: 3},
expCut: Cut{CutHeader: CutHeader{Line: 1, Span: 3}, Lines: []string{"1", "2", "3"}},
},
{
name: "last 2 lines",
params: DiffCutParams{LineStart: 5, LineEnd: 6, LineLimit: 40},
expCutHeader: CutHeader{Line: 5, Span: 2},
expCut: Cut{CutHeader: CutHeader{Line: 5, Span: 2}, Lines: []string{"5", "6"}},
},
{
name: "first 3 lines with 1 more",
params: DiffCutParams{LineStart: 1, LineEnd: 3, BeforeLines: 1, AfterLines: 1, LineLimit: 40},
expCutHeader: CutHeader{Line: 1, Span: 3},
expCut: Cut{CutHeader: CutHeader{Line: 1, Span: 4}, Lines: []string{"1", "2", "3", "4"}},
},
{
name: "last 2 lines with 2 more",
params: DiffCutParams{LineStart: 5, LineEnd: 6, BeforeLines: 2, AfterLines: 2, LineLimit: 40},
expCutHeader: CutHeader{Line: 5, Span: 2},
expCut: Cut{CutHeader: CutHeader{Line: 3, Span: 4}, Lines: []string{"3", "4", "5", "6"}},
},
{
name: "mid range",
params: DiffCutParams{LineStart: 2, LineEnd: 4, BeforeLines: 100, AfterLines: 100, LineLimit: 40},
expCutHeader: CutHeader{Line: 2, Span: 3},
expCut: Cut{CutHeader: CutHeader{Line: 1, Span: 6}, Lines: []string{"1", "2", "3", "4", "5", "6"}},
},
{
name: "out of range 1",
params: DiffCutParams{LineStart: 15, LineEnd: 17, LineLimit: 40},
expError: ErrHunkNotFound,
},
{
name: "out of range 2",
params: DiffCutParams{LineStart: 5, LineEnd: 7, BeforeLines: 1, AfterLines: 1, LineLimit: 40},
expError: ErrHunkNotFound,
},
{
name: "limited",
params: DiffCutParams{LineStart: 3, LineEnd: 4, BeforeLines: 1, AfterLines: 1, LineLimit: 3},
expCutHeader: CutHeader{Line: 3, Span: 2},
expCut: Cut{CutHeader: CutHeader{Line: 2, Span: 3}, Lines: []string{"2", "3", "4"}},
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
ch, c, err := BlobCut(strings.NewReader(input), test.params)
if err != test.expError { //nolint:errorlint // it's a unit test and no errors are wrapped
t.Errorf("test failed with error: %s", err.Error())
return
}
if want, got := test.expCutHeader, ch; want != got {
t.Errorf("cut header: want=%+v got: %+v", want, got)
}
if want, got := test.expCut, c; !reflect.DeepEqual(want, got) {
t.Errorf("cut: want=%+v got: %+v", want, got)
}
})
}
}
func TestStrCircBuf(t *testing.T) {
tests := []struct {
name string

View File

@ -24,23 +24,29 @@ import (
type DiffStatus byte
const (
DiffStatusModified = 'M'
DiffStatusAdded = 'A'
DiffStatusDeleted = 'D'
DiffStatusRenamed = 'R'
DiffStatusCopied = 'C'
DiffStatusType = 'T'
DiffStatusModified DiffStatus = 'M'
DiffStatusAdded DiffStatus = 'A'
DiffStatusDeleted DiffStatus = 'D'
DiffStatusRenamed DiffStatus = 'R'
DiffStatusCopied DiffStatus = 'C'
DiffStatusType DiffStatus = 'T'
)
type DiffRawFile struct {
OldBlobSHA string
NewBlobSHA string
Status byte
OldPath string
Path string
func (s DiffStatus) String() string {
return fmt.Sprintf("%c", s)
}
var regexpDiffRaw = regexp.MustCompile(`:\d{6} \d{6} ([0-9a-f]+) ([0-9a-f]+) (\w)(\d*)`)
type DiffRawFile struct {
OldFileMode string
NewFileMode string
OldBlobSHA string
NewBlobSHA string
Status DiffStatus
OldPath string
Path string
}
var regexpDiffRaw = regexp.MustCompile(`:(\d{6}) (\d{6}) ([0-9a-f]+) ([0-9a-f]+) (\w)(\d*)`)
// DiffRaw parses raw git diff output (git diff --raw). Each entry (a line) is a changed file.
// The format is:
@ -72,7 +78,7 @@ func DiffRaw(r io.Reader) ([]DiffRawFile, error) {
path = scan.Text()
status := groups[3][0]
status := DiffStatus(groups[5][0])
switch status {
case DiffStatusRenamed, DiffStatusCopied:
if !scan.Scan() {
@ -86,11 +92,13 @@ func DiffRaw(r io.Reader) ([]DiffRawFile, error) {
}
result = append(result, DiffRawFile{
OldBlobSHA: groups[1],
NewBlobSHA: groups[2],
Status: status,
OldPath: oldPath,
Path: path,
OldFileMode: groups[1],
NewFileMode: groups[2],
OldBlobSHA: groups[3],
NewBlobSHA: groups[4],
Status: status,
OldPath: oldPath,
Path: path,
})
}
if err := scan.Err(); err != nil {

View File

@ -14,9 +14,10 @@
package parser
import "errors"
import "github.com/harness/gitness/errors"
var (
ErrSHADoesNotMatch = errors.New("sha does not match")
ErrHunkNotFound = errors.New("hunk not found")
ErrSHADoesNotMatch = errors.InvalidArgument("sha does not match")
ErrHunkNotFound = errors.NotFound("hunk not found")
ErrBinaryFile = errors.InvalidArgument("can't handle a binary file")
)

View File

@ -33,6 +33,16 @@ type HunkHeader struct {
Text string
}
type Cut struct {
CutHeader
Lines []string
}
type CutHeader struct {
Line int
Span int
}
var regExpHunkHeader = regexp.MustCompile(`^@@ -([0-9]+)(,([0-9]+))? \+([0-9]+)(,([0-9]+))? @@( (.+))?$`)
func (h *HunkHeader) IsZero() bool {