diff --git a/git/api/diff.go b/git/api/diff.go index 80e57065b..937641c25 100644 --- a/git/api/diff.go +++ b/git/api/diff.go @@ -407,14 +407,13 @@ func (g *Git) DiffCut( return parser.HunkHeader{}, parser.Hunk{}, fmt.Errorf("failed to find the list of changed files: %w", err) } - var ( - oldSHA, newSHA string - filePath string - ) + var oldSHA, newSHA string for _, entry := range diffEntries { - switch entry.Status { - case parser.DiffStatusRenamed, parser.DiffStatusCopied: + if entry.Status == parser.DiffStatusRenamed || entry.Status == parser.DiffStatusCopied { + // Entries with the status 'R' and 'C' output two paths: the old path and the new path. + // Using the params.LineStartNew flag to match the path with the entry's old or new path. + if entry.Path != path && entry.OldPath != path { continue } @@ -427,34 +426,79 @@ func (g *Git) DiffCut( msg := "for renamed files provide the old file name if commenting the old lines" return parser.HunkHeader{}, parser.Hunk{}, errors.InvalidArgument(msg) } - default: - if entry.Path != path { - continue - } + } else if entry.Path != path { + // All other statuses output just one path: If the path doesn't match it, proceed with the next entry. + continue } + rawFileMode := entry.OldFileMode + if params.LineStartNew { + rawFileMode = entry.NewFileMode + } + + fileType, _, err := parseTreeNodeMode(rawFileMode) + if err != nil { + return parser.HunkHeader{}, parser.Hunk{}, + fmt.Errorf("failed to parse file mode %s for path %s: %w", rawFileMode, path, err) + } + + switch fileType { + default: + return parser.HunkHeader{}, parser.Hunk{}, errors.Internal(nil, "unrecognized object type") + case TreeNodeTypeCommit: + msg := "code comment is not allowed on a submodule" + return parser.HunkHeader{}, parser.Hunk{}, errors.InvalidArgument(msg) + case TreeNodeTypeTree: + msg := "code comment is not allowed on a directory" + return parser.HunkHeader{}, parser.Hunk{}, errors.InvalidArgument(msg) + case TreeNodeTypeBlob: + // a blob is what we want + } + + var hunkHeader parser.HunkHeader + var hunk parser.Hunk + switch entry.Status { case parser.DiffStatusRenamed, parser.DiffStatusCopied, parser.DiffStatusModified: - // For modified and renamed compare file blobs directly. + // For modified and renamed compare file blob SHAs directly. oldSHA = entry.OldBlobSHA newSHA = entry.NewBlobSHA - case parser.DiffStatusAdded, parser.DiffStatusDeleted: - // For added and deleted compare commits, but with the provided path. - oldSHA = targetRef - newSHA = sourceRef - filePath = entry.Path + hunkHeader, hunk, err = g.diffCutFromHunk(ctx, repoPath, oldSHA, newSHA, params) + case parser.DiffStatusAdded, parser.DiffStatusDeleted, parser.DiffStatusType: + // for added and deleted files read the file content directly + if params.LineStartNew { + hunkHeader, hunk, err = g.diffCutFromBlob(ctx, repoPath, true, entry.NewBlobSHA, params) + } else { + hunkHeader, hunk, err = g.diffCutFromBlob(ctx, repoPath, false, entry.OldBlobSHA, params) + } + default: + return parser.HunkHeader{}, parser.Hunk{}, + fmt.Errorf("unrecognized git diff file status=%c for path=%s", entry.Status, path) + } + if err != nil { + return parser.HunkHeader{}, parser.Hunk{}, + fmt.Errorf("failed to extract hunk for git diff file status=%c path=%s: %w", entry.Status, path, err) } - break + // The returned diff hunk will be stored in the DB and will only be used for display, as a reference. + // Therefore, we trim too long lines to protect the system and the DB. + const maxLineLen = 200 + parser.LimitLineLen(&hunk.Lines, maxLineLen) + + return hunkHeader, hunk, nil } - if newSHA == "" { - return parser.HunkHeader{}, parser.Hunk{}, errors.NotFound("file %s not found in the diff", path) - } + return parser.HunkHeader{}, parser.Hunk{}, errors.NotFound("path not found") +} - // next pull the diff cut for the requested file - - pipeRead, pipeWrite = io.Pipe() +func (g *Git) diffCutFromHunk( + ctx context.Context, + repoPath string, + oldSHA string, + newSHA string, + params parser.DiffCutParams, +) (parser.HunkHeader, parser.Hunk, error) { + pipeRead, pipeWrite := io.Pipe() stderr := bytes.NewBuffer(nil) go func() { var err error @@ -470,11 +514,6 @@ func (g *Git) DiffCut( command.WithFlag("--unified=100000000"), command.WithArg(oldSHA), command.WithArg(newSHA)) - if filePath != "" { - cmd.Add( - command.WithFlag("--merge-base"), - command.WithPostSepArg(filePath)) - } err = cmd.Run(ctx, command.WithDir(repoPath), @@ -495,6 +534,75 @@ func (g *Git) DiffCut( return diffCutHeader, linesHunk, nil } +func (g *Git) diffCutFromBlob( + ctx context.Context, + repoPath string, + asAdded bool, + sha string, + params parser.DiffCutParams, +) (parser.HunkHeader, parser.Hunk, error) { + pipeRead, pipeWrite := io.Pipe() + go func() { + var err error + + defer func() { + // If running of the command below fails, make the pipe reader also fail with the same error. + _ = pipeWrite.CloseWithError(err) + }() + + cmd := command.New("cat-file", + command.WithFlag("-p"), + command.WithArg(sha)) + + err = cmd.Run(ctx, + command.WithDir(repoPath), + command.WithStdout(pipeWrite)) + }() + + cutHeader, cut, err := parser.BlobCut(pipeRead, params) + if err != nil { + // Next check if reading the git diff output caused an error + return parser.HunkHeader{}, parser.Hunk{}, err + } + + // Convert parser.CutHeader to parser.HunkHeader and parser.Cut to parser.Hunk. + + var hunkHeader parser.HunkHeader + var hunk parser.Hunk + + if asAdded { + for i := range cut.Lines { + cut.Lines[i] = "+ " + cut.Lines[i] + } + + hunkHeader = parser.HunkHeader{ + NewLine: cutHeader.Line, + NewSpan: cutHeader.Span, + } + + hunk = parser.Hunk{ + HunkHeader: parser.HunkHeader{NewLine: cut.Line, NewSpan: cut.Span}, + Lines: cut.Lines, + } + } else { + for i := range cut.Lines { + cut.Lines[i] = "- " + cut.Lines[i] + } + + hunkHeader = parser.HunkHeader{ + OldLine: cutHeader.Line, + OldSpan: cutHeader.Span, + } + + hunk = parser.Hunk{ + HunkHeader: parser.HunkHeader{OldLine: cut.Line, OldSpan: cut.Span}, + Lines: cut.Lines, + } + } + + return hunkHeader, hunk, nil +} + func (g *Git) DiffFileName(ctx context.Context, repoPath string, baseRef string, diff --git a/git/parser/diff_cut.go b/git/parser/diff_cut.go index e1c0c152c..fdc6e8c9c 100644 --- a/git/parser/diff_cut.go +++ b/git/parser/diff_cut.go @@ -17,7 +17,9 @@ package parser import ( "bufio" "errors" + "fmt" "io" + "unicode/utf8" ) type DiffFileHeader struct { @@ -239,6 +241,97 @@ again: return } +// BlobCut parses raw file and returns lines specified with the parameter. +func BlobCut(r io.Reader, params DiffCutParams) (CutHeader, Cut, error) { + scanner := bufio.NewScanner(r) + + var ( + err error + lineNumber int + inCut bool + cutStart, cutSpan int + cutLines []string + ) + + extStart := params.LineStart - params.BeforeLines + extEnd := params.LineEnd + params.AfterLines + linesNeeded := params.LineEnd - params.LineStart + 1 + + for { + if !scanner.Scan() { + err = scanner.Err() + break + } + + lineNumber++ + line := scanner.Text() + + if !utf8.ValidString(line) { + return CutHeader{}, Cut{}, ErrBinaryFile + } + + if lineNumber > extEnd { + break // exceeded the requested line range + } + + if lineNumber < extStart { + // not yet in the requested line range + continue + } + + if !inCut { + cutStart = lineNumber + inCut = true + } + cutLines = append(cutLines, line) + cutSpan++ + + if lineNumber >= params.LineStart && lineNumber <= params.LineEnd { + linesNeeded-- + } + + if len(cutLines) >= params.LineLimit { + break + } + } + + if errors.Is(err, bufio.ErrTooLong) { + // By default, the max token size is 65536 (bufio.MaxScanTokenSize). + // If the file contains a line that is longer than this we treat it as a binary file. + return CutHeader{}, Cut{}, ErrBinaryFile + } + + if err != nil && !errors.Is(err, io.EOF) { + return CutHeader{}, Cut{}, fmt.Errorf("failed to parse blob cut: %w", err) + } + + if !inCut || linesNeeded > 0 { + return CutHeader{}, Cut{}, ErrHunkNotFound + } + + // the cut header is hunk-like header (with Line and Span) that describes the requested lines exactly + ch := CutHeader{Line: params.LineStart, Span: params.LineEnd - params.LineStart + 1} + + // the cut includes the requested lines and few more lines specified with the BeforeLines and AfterLines. + c := Cut{CutHeader: CutHeader{Line: cutStart, Span: cutSpan}, Lines: cutLines} + + return ch, c, nil +} + +func LimitLineLen(lines *[]string, maxLen int) { +outer: + for idxLine, line := range *lines { + var l int + for idxRune := range line { + l++ + if l > maxLen { + (*lines)[idxLine] = line[:idxRune] + "…" // append the ellipsis to indicate that the line was trimmed. + continue outer + } + } + } +} + type strCircBuf struct { head int entries []string diff --git a/git/parser/diff_cut_test.go b/git/parser/diff_cut_test.go index 9a6aaf678..25f2b11e6 100644 --- a/git/parser/diff_cut_test.go +++ b/git/parser/diff_cut_test.go @@ -237,6 +237,88 @@ index af7864ba..541cb64f 100644 } } +func TestBlobCut(t *testing.T) { + const input = `1 +2 +3 +4 +5 +6` + + tests := []struct { + name string + params DiffCutParams + expCutHeader CutHeader + expCut Cut + expError error + }{ + { + name: "first 3 lines", + params: DiffCutParams{LineStart: 1, LineEnd: 3, LineLimit: 40}, + expCutHeader: CutHeader{Line: 1, Span: 3}, + expCut: Cut{CutHeader: CutHeader{Line: 1, Span: 3}, Lines: []string{"1", "2", "3"}}, + }, + { + name: "last 2 lines", + params: DiffCutParams{LineStart: 5, LineEnd: 6, LineLimit: 40}, + expCutHeader: CutHeader{Line: 5, Span: 2}, + expCut: Cut{CutHeader: CutHeader{Line: 5, Span: 2}, Lines: []string{"5", "6"}}, + }, + { + name: "first 3 lines with 1 more", + params: DiffCutParams{LineStart: 1, LineEnd: 3, BeforeLines: 1, AfterLines: 1, LineLimit: 40}, + expCutHeader: CutHeader{Line: 1, Span: 3}, + expCut: Cut{CutHeader: CutHeader{Line: 1, Span: 4}, Lines: []string{"1", "2", "3", "4"}}, + }, + { + name: "last 2 lines with 2 more", + params: DiffCutParams{LineStart: 5, LineEnd: 6, BeforeLines: 2, AfterLines: 2, LineLimit: 40}, + expCutHeader: CutHeader{Line: 5, Span: 2}, + expCut: Cut{CutHeader: CutHeader{Line: 3, Span: 4}, Lines: []string{"3", "4", "5", "6"}}, + }, + { + name: "mid range", + params: DiffCutParams{LineStart: 2, LineEnd: 4, BeforeLines: 100, AfterLines: 100, LineLimit: 40}, + expCutHeader: CutHeader{Line: 2, Span: 3}, + expCut: Cut{CutHeader: CutHeader{Line: 1, Span: 6}, Lines: []string{"1", "2", "3", "4", "5", "6"}}, + }, + { + name: "out of range 1", + params: DiffCutParams{LineStart: 15, LineEnd: 17, LineLimit: 40}, + expError: ErrHunkNotFound, + }, + { + name: "out of range 2", + params: DiffCutParams{LineStart: 5, LineEnd: 7, BeforeLines: 1, AfterLines: 1, LineLimit: 40}, + expError: ErrHunkNotFound, + }, + { + name: "limited", + params: DiffCutParams{LineStart: 3, LineEnd: 4, BeforeLines: 1, AfterLines: 1, LineLimit: 3}, + expCutHeader: CutHeader{Line: 3, Span: 2}, + expCut: Cut{CutHeader: CutHeader{Line: 2, Span: 3}, Lines: []string{"2", "3", "4"}}, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ch, c, err := BlobCut(strings.NewReader(input), test.params) + if err != test.expError { //nolint:errorlint // it's a unit test and no errors are wrapped + t.Errorf("test failed with error: %s", err.Error()) + return + } + + if want, got := test.expCutHeader, ch; want != got { + t.Errorf("cut header: want=%+v got: %+v", want, got) + } + + if want, got := test.expCut, c; !reflect.DeepEqual(want, got) { + t.Errorf("cut: want=%+v got: %+v", want, got) + } + }) + } +} + func TestStrCircBuf(t *testing.T) { tests := []struct { name string diff --git a/git/parser/diff_raw.go b/git/parser/diff_raw.go index 48b1c52c7..b42166c29 100644 --- a/git/parser/diff_raw.go +++ b/git/parser/diff_raw.go @@ -24,23 +24,29 @@ import ( type DiffStatus byte const ( - DiffStatusModified = 'M' - DiffStatusAdded = 'A' - DiffStatusDeleted = 'D' - DiffStatusRenamed = 'R' - DiffStatusCopied = 'C' - DiffStatusType = 'T' + DiffStatusModified DiffStatus = 'M' + DiffStatusAdded DiffStatus = 'A' + DiffStatusDeleted DiffStatus = 'D' + DiffStatusRenamed DiffStatus = 'R' + DiffStatusCopied DiffStatus = 'C' + DiffStatusType DiffStatus = 'T' ) -type DiffRawFile struct { - OldBlobSHA string - NewBlobSHA string - Status byte - OldPath string - Path string +func (s DiffStatus) String() string { + return fmt.Sprintf("%c", s) } -var regexpDiffRaw = regexp.MustCompile(`:\d{6} \d{6} ([0-9a-f]+) ([0-9a-f]+) (\w)(\d*)`) +type DiffRawFile struct { + OldFileMode string + NewFileMode string + OldBlobSHA string + NewBlobSHA string + Status DiffStatus + OldPath string + Path string +} + +var regexpDiffRaw = regexp.MustCompile(`:(\d{6}) (\d{6}) ([0-9a-f]+) ([0-9a-f]+) (\w)(\d*)`) // DiffRaw parses raw git diff output (git diff --raw). Each entry (a line) is a changed file. // The format is: @@ -72,7 +78,7 @@ func DiffRaw(r io.Reader) ([]DiffRawFile, error) { path = scan.Text() - status := groups[3][0] + status := DiffStatus(groups[5][0]) switch status { case DiffStatusRenamed, DiffStatusCopied: if !scan.Scan() { @@ -86,11 +92,13 @@ func DiffRaw(r io.Reader) ([]DiffRawFile, error) { } result = append(result, DiffRawFile{ - OldBlobSHA: groups[1], - NewBlobSHA: groups[2], - Status: status, - OldPath: oldPath, - Path: path, + OldFileMode: groups[1], + NewFileMode: groups[2], + OldBlobSHA: groups[3], + NewBlobSHA: groups[4], + Status: status, + OldPath: oldPath, + Path: path, }) } if err := scan.Err(); err != nil { diff --git a/git/parser/errors.go b/git/parser/errors.go index 1111fbc7b..e67a16803 100644 --- a/git/parser/errors.go +++ b/git/parser/errors.go @@ -14,9 +14,10 @@ package parser -import "errors" +import "github.com/harness/gitness/errors" var ( - ErrSHADoesNotMatch = errors.New("sha does not match") - ErrHunkNotFound = errors.New("hunk not found") + ErrSHADoesNotMatch = errors.InvalidArgument("sha does not match") + ErrHunkNotFound = errors.NotFound("hunk not found") + ErrBinaryFile = errors.InvalidArgument("can't handle a binary file") ) diff --git a/git/parser/hunk.go b/git/parser/hunk.go index a02dab33a..5d98ec5c0 100644 --- a/git/parser/hunk.go +++ b/git/parser/hunk.go @@ -33,6 +33,16 @@ type HunkHeader struct { Text string } +type Cut struct { + CutHeader + Lines []string +} + +type CutHeader struct { + Line int + Span int +} + var regExpHunkHeader = regexp.MustCompile(`^@@ -([0-9]+)(,([0-9]+))? \+([0-9]+)(,([0-9]+))? @@( (.+))?$`) func (h *HunkHeader) IsZero() bool {