feat(git_patch_parser): add Git patch parsing functionality
Some checks failed
Auto Changelog & (Release) / release (push) Successful in 14s
Build and Publish nightly package / build-and-publish (push) Failing after 1m29s

- Introduce a data model for structured representation of Git patches
- Implement a parser to extract key fields from patch text
- Add unit tests to ensure robustness and validate edge cases
This commit is contained in:
2025-07-23 13:39:10 +02:00
parent f34c5e7bcb
commit d3b4dc11fb
3 changed files with 201 additions and 0 deletions

View File

@@ -0,0 +1,42 @@
from pydantic import BaseModel, Field
from typing import Optional
class GitPatch(BaseModel):
    """Structured form of one patch in ``git format-patch`` layout.

    Attributes:
        commit_hash: Commit id taken from the leading 'From' line; validated
            as exactly 40 lowercase hexadecimal characters.
        author: Value of the 'From:' header, e.g. 'Max <mail@example.com>'.
        date: Value of the 'Date:' header, kept as the raw string
            (e.g. 'Wed, 23 Jul 2025 12:34:56 +0200'); it is not parsed.
        subject: Value of the 'Subject:' header.
        description: Free text between the headers and the diff; optional,
            defaults to ``None``.
        diff: The patch body, beginning at 'diff --git' or '--- a/...'.
    """

    # Length bounds and the pattern both pin the hash to 40 hex characters.
    commit_hash: str = Field(
        ...,
        description="40-character SHA-1 commit hash from the 'From' header",
        pattern=r"^[0-9a-f]{40}$",
        min_length=40,
        max_length=40,
    )
    author: str = Field(
        ...,
        description="Author line from the patch header (e.g. 'Max <mail@example.com>')",
    )
    date: str = Field(
        ...,
        description="Commit date in RFC-2822 format, e.g. 'Wed, 23 Jul 2025 12:34:56 +0200'",
    )
    subject: str = Field(
        ...,
        description="Commit subject line from the Subject header",
    )
    # The only field with a default; every other field is required.
    description: Optional[str] = Field(
        default=None,
        description="Optional description text from the commit body (between header and diff)",
    )
    diff: str = Field(
        ...,
        description="Complete patch diff (starting with 'diff --git' or '--- a/...')",
    )

View File

@@ -0,0 +1,81 @@
import re
from .models import GitPatch
# Commit hash from the mbox-style 'From <sha> <date>' line that opens the patch.
RE_COMMIT_HASH = re.compile(
    r'^From\s+(?P<hash>[0-9a-f]{40})\s',
    flags=re.MULTILINE
)

# Header values. Only blanks (space/tab) are accepted between the colon and
# the value: '\s+' would also consume the newline, so an empty header such as
# 'From:' would silently capture the *next* line as its value.
RE_AUTHOR = re.compile(r'^From:[ \t]+(?P<author>.+)$', re.MULTILINE)
RE_DATE = re.compile(r'^Date:[ \t]+(?P<date>.+)$', re.MULTILINE)
RE_SUBJECT = re.compile(r'^Subject:[ \t]+(?P<subject>.+)$', re.MULTILINE)

# Commit description: everything after the first empty line (end of the
# headers) up to the first line that starts the patch part, i.e.
#   * a line consisting only of '---' (the separator git format-patch writes
#     before the diffstat),
#   * a line starting with 'diff --git', or
#   * a line starting with '--- a/' / '--- b/'.
# The terminator may follow a newline *or* sit directly at the start of the
# line after the blank line (empty description); without the '^' alternative
# an empty body would make the lazy match run on into the diff.
RE_DESCRIPTION = re.compile(
    r'\n\n(?P<desc>.*?)(?=(?:\n|^)(?:---[ \t]*$|diff --git|---\s[a-b]/))',
    flags=re.DOTALL | re.MULTILINE
)

# Diff content: from the first line starting with "diff --git" or "--- a/"
# to the end of the text (the diffstat before it is not part of the diff).
RE_DIFF = re.compile(
    r'\n(?P<diff>(?:diff --git|---\s[a-b]/).*?)\Z',
    flags=re.DOTALL
)
# One continuation line of a folded header: a newline, leading blanks, then
# at least one non-blank character (so blank separator lines never match).
_RE_FOLDED_LINE = re.compile(r'\n[ \t]+(?P<text>\S[^\n]*)')


def _unfold_header(patch_text: str, match, group: str) -> str:
    """Return the header value captured by *match*, joined with any folded lines.

    Long header values (typically the subject) may be wrapped onto following
    lines that begin with a space or tab. Those lines are appended to the
    captured value, separated by a single space. When nothing is folded the
    captured value is returned unchanged.
    """
    value = match.group(group)
    continuation = _RE_FOLDED_LINE.match(patch_text, match.end())
    if continuation is None:
        return value
    parts = [value.rstrip()]
    while continuation is not None:
        parts.append(continuation.group('text').rstrip())
        continuation = _RE_FOLDED_LINE.match(patch_text, continuation.end())
    return ' '.join(parts)


def parse_git_patch(patch_text: str) -> GitPatch:
    """Parse the text of a git format-patch file into a GitPatch model.

    Args:
        patch_text (str): The raw text content of a git format-patch file.

    Returns:
        GitPatch: A structured representation of the patch. ``description``
        and ``diff`` are empty strings when the corresponding section is not
        found.

    Raises:
        ValueError: If a required field (commit hash, author, date, subject)
            is missing; the fields are checked in that order and the message
            names the first one that is absent.
    """
    # Required fields: fail fast with a field-specific message.
    commit_match = RE_COMMIT_HASH.search(patch_text)
    if commit_match is None:
        raise ValueError("Commit hash not found in patch")
    commit_hash = commit_match.group('hash')

    author_match = RE_AUTHOR.search(patch_text)
    if author_match is None:
        raise ValueError("Author not found in patch")
    author = _unfold_header(patch_text, author_match, 'author')

    date_match = RE_DATE.search(patch_text)
    if date_match is None:
        raise ValueError("Date not found in patch")
    date = _unfold_header(patch_text, date_match, 'date')

    subject_match = RE_SUBJECT.search(patch_text)
    if subject_match is None:
        raise ValueError("Subject not found in patch")
    # Long subjects are wrapped onto continuation lines; keep the full text.
    subject = _unfold_header(patch_text, subject_match, 'subject')

    # Optional sections fall back to empty strings when absent.
    desc_match = RE_DESCRIPTION.search(patch_text)
    diff_match = RE_DIFF.search(patch_text)
    description = desc_match.group('desc').rstrip() if desc_match else ''
    diff = diff_match.group('diff') if diff_match else ''

    return GitPatch(
        commit_hash=commit_hash,
        author=author,
        date=date,
        subject=subject,
        description=description,
        diff=diff,
    )

78
tests/test_parser.py Normal file
View File

@@ -0,0 +1,78 @@
import unittest
from patchman.utils.git_patch_parser.parser import parse_git_patch
from patchman.utils.git_patch_parser.models import GitPatch
class TestGitPatchParser(unittest.TestCase):
    """Tests for ``parse_git_patch``: one full patch, the missing-field errors,
    and a headers-only patch."""

    # mbox-style separator line shared by every fixture below.
    FROM_LINE = "From 0123456789abcdef1234567890abcdef12345678 Mon Sep 17 00:00:00 2001\n"

    def _assert_parse_error(self, patch_text, expected_message):
        """Assert that parsing *patch_text* raises ValueError mentioning *expected_message*."""
        with self.assertRaises(ValueError) as context:
            parse_git_patch(patch_text)
        self.assertIn(expected_message, str(context.exception))

    def test_parse_valid_patch(self):
        # The blank line after the headers is what marks the start of the
        # description; without it the parser cannot find a description.
        patch_text = (
            self.FROM_LINE
            + "From: Max Mustermann <max@example.com>\n"
            "Date: Wed, 23 Jul 2025 12:34:56 +0200\n"
            "Subject: [PATCH] Dein Patch-Titel\n"
            "\n"
            "Dies ist eine Beschreibung des Patches.\n"
            "\n"
            "diff --git a/file.txt b/file.txt\n"
            "index 83db48f..f735c3b 100644\n"
            "--- a/file.txt\n"
            "+++ b/file.txt\n"
            "@@ -1 +1 @@\n"
            "-Hello World\n"
            "+Hello Patchman\n"
        )
        expected_patch = GitPatch(
            commit_hash="0123456789abcdef1234567890abcdef12345678",
            author="Max Mustermann <max@example.com>",
            date="Wed, 23 Jul 2025 12:34:56 +0200",
            subject="[PATCH] Dein Patch-Titel",
            description="Dies ist eine Beschreibung des Patches.",
            diff=(
                "diff --git a/file.txt b/file.txt\n"
                "index 83db48f..f735c3b 100644\n"
                "--- a/file.txt\n"
                "+++ b/file.txt\n"
                "@@ -1 +1 @@\n"
                "-Hello World\n"
                "+Hello Patchman\n"
            ),
        )
        parsed_patch = parse_git_patch(patch_text)
        self.assertEqual(parsed_patch, expected_patch)

    def test_parse_invalid_patch(self):
        self._assert_parse_error("Invalid patch content", "Commit hash not found")

    def test_parse_patch_missing_author(self):
        patch_text = (
            self.FROM_LINE
            + "Date: Wed, 23 Jul 2025 12:34:56 +0200\n"
            "Subject: [PATCH] Test"
        )
        self._assert_parse_error(patch_text, "Author not found")

    def test_parse_patch_missing_date(self):
        patch_text = (
            self.FROM_LINE
            + "From: Test Author <test@example.com>\n"
            "Subject: [PATCH] Test"
        )
        self._assert_parse_error(patch_text, "Date not found")

    def test_parse_patch_missing_subject(self):
        patch_text = (
            self.FROM_LINE
            + "From: Test Author <test@example.com>\n"
            "Date: Wed, 23 Jul 2025 12:34:56 +0200"
        )
        self._assert_parse_error(patch_text, "Subject not found")

    def test_parse_patch_without_description_and_diff(self):
        # Headers only: the optional sections fall back to empty strings.
        patch_text = (
            self.FROM_LINE
            + "From: Test Author <test@example.com>\n"
            "Date: Wed, 23 Jul 2025 12:34:56 +0200\n"
            "Subject: [PATCH] Minimal patch"
        )
        parsed_patch = parse_git_patch(patch_text)
        self.assertEqual(parsed_patch.commit_hash, "0123456789abcdef1234567890abcdef12345678")
        self.assertEqual(parsed_patch.author, "Test Author <test@example.com>")
        self.assertEqual(parsed_patch.date, "Wed, 23 Jul 2025 12:34:56 +0200")
        self.assertEqual(parsed_patch.subject, "[PATCH] Minimal patch")
        self.assertEqual(parsed_patch.description, "")
        self.assertEqual(parsed_patch.diff, "")
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()