feat(git_patch_parser): add Git patch parsing functionality
- Introduce a data model for structured representation of Git patches
- Implement a parser to extract key fields from patch text
- Add unit tests to ensure robustness and validate edge cases
This commit is contained in:
42
src/patchman/utils/git_patch_parser/models.py
Normal file
42
src/patchman/utils/git_patch_parser/models.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class GitPatch(BaseModel):
    """Structured result of parsing a single Git patch.

    Attributes:
        commit_hash (str): Exactly 40 lowercase hexadecimal characters (SHA-1), taken from the 'From' header.
        author (str): Author line from the patch header (e.g., 'Max <mail@example.com>').
        date (str): Commit date string in RFC-2822 format (e.g., 'Wed, 23 Jul 2025 12:34:56 +0200').
        subject (str): Subject line from the Subject header.
        description (Optional[str]): Commit body text found between the header and the diff; defaults to None.
        diff (str): The patch diff itself (starting with 'diff --git' or '--- a/...').
    """

    # The length bounds and the pattern all enforce the same 40-hex-character shape.
    commit_hash: str = Field(
        default=...,
        min_length=40, max_length=40, pattern="^[0-9a-f]{40}$",
        description="40-character SHA-1 commit hash from the 'From' header",
    )

    # Required free-form header values; no format validation is applied to them.
    author: str = Field(
        default=...,
        description="Author line from the patch header (e.g. 'Max <mail@example.com>')",
    )
    date: str = Field(
        default=...,
        description="Commit date in RFC-2822 format, e.g. 'Wed, 23 Jul 2025 12:34:56 +0200'",
    )
    subject: str = Field(
        default=...,
        description="Commit subject line from the Subject header",
    )

    # The only field that may be omitted when constructing the model.
    description: Optional[str] = Field(
        default=None,
        description="Optional description text from the commit body (between header and diff)",
    )

    diff: str = Field(
        default=...,
        description="Complete patch diff (starting with 'diff --git' or '--- a/...')",
    )
|
81
src/patchman/utils/git_patch_parser/parser.py
Normal file
81
src/patchman/utils/git_patch_parser/parser.py
Normal file
@@ -0,0 +1,81 @@
|
||||
import re
|
||||
from .models import GitPatch
|
||||
|
||||
# Match the commit hash from the 'From' line at the beginning of the patch
RE_COMMIT_HASH = re.compile(
    r'^From\s+(?P<hash>[0-9a-f]{40})\s',
    flags=re.MULTILINE
)

# Extract author information from the 'From:' header line
RE_AUTHOR = re.compile(r'^From:\s+(?P<author>.+)$', re.MULTILINE)

# Extract commit date from the 'Date:' header line
RE_DATE = re.compile(r'^Date:\s+(?P<date>.+)$', re.MULTILINE)

# Extract commit subject from the 'Subject:' header line
RE_SUBJECT = re.compile(r'^Subject:\s+(?P<subject>.+)$', re.MULTILINE)

# Extract commit description: everything between the first empty line and the
# first line that starts the patch part. The terminator is anchored with '^'
# (start of line) rather than a preceding '\n', so an EMPTY description
# (the diff directly follows the blank line) yields '' instead of swallowing
# the head of the diff. A line consisting solely of '---' also ends the
# description: git format-patch writes it between the commit message and the
# diffstat, and git am cuts the message there too.
# The captured text may keep trailing newlines; callers are expected to strip.
RE_DESCRIPTION = re.compile(
    r'\n\n(?P<desc>.*?)(?=^(?:diff --git|---\s[a-b]/|---$))',
    flags=re.DOTALL | re.MULTILINE
)

# Extract the diff content: from the first "diff --git" or "--- a/" to the end of file
RE_DIFF = re.compile(
    r'\n(?P<diff>(?:diff --git|---\s[a-b]/).*?)\Z',
    flags=re.DOTALL
)
|
||||
|
||||
|
||||
def parse_git_patch(patch_text: str) -> GitPatch:
    """Turn the text of a git format-patch file into a GitPatch model.

    The four header values are looked up in a fixed order (commit hash,
    author, date, subject) and the first one that cannot be found aborts
    parsing. Description and diff are optional and become empty strings
    when their patterns do not match.

    Args:
        patch_text (str): The raw text content of a git format-patch file.

    Returns:
        GitPatch: A structured representation of the patch.

    Raises:
        ValueError: If a required field (commit hash, author, date, subject) is missing.
    """
    # (pattern, named group, error message) — order determines which error wins.
    required_headers = (
        (RE_COMMIT_HASH, 'hash', "Commit hash not found in patch"),
        (RE_AUTHOR, 'author', "Author not found in patch"),
        (RE_DATE, 'date', "Date not found in patch"),
        (RE_SUBJECT, 'subject', "Subject not found in patch"),
    )

    header_values = {}
    for pattern, group_name, error_message in required_headers:
        found = pattern.search(patch_text)
        if found is None:
            raise ValueError(error_message)
        header_values[group_name] = found.group(group_name)

    # Optional parts: a missing match falls back to an empty string.
    body_match = RE_DESCRIPTION.search(patch_text)
    if body_match:
        # Drop trailing whitespace left between the body and the diff.
        body_text = body_match.group('desc').rstrip()
    else:
        body_text = ''

    patch_match = RE_DIFF.search(patch_text)
    if patch_match:
        diff_text = patch_match.group('diff')
    else:
        diff_text = ''

    return GitPatch(
        commit_hash=header_values['hash'],
        author=header_values['author'],
        date=header_values['date'],
        subject=header_values['subject'],
        description=body_text,
        diff=diff_text,
    )
|
78
tests/test_parser.py
Normal file
78
tests/test_parser.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import unittest
|
||||
from patchman.utils.git_patch_parser.parser import parse_git_patch
|
||||
from patchman.utils.git_patch_parser.models import GitPatch
|
||||
|
||||
class TestGitPatchParser(unittest.TestCase):
    """Tests for parse_git_patch: one complete patch, missing-header errors, and a header-only patch."""

    # Shared fixture pieces reused by every test case.
    SHA = "0123456789abcdef1234567890abcdef12345678"
    MBOX_LINE = "From " + SHA + " Mon Sep 17 00:00:00 2001"
    AUTHOR_LINE = "From: Test Author <test@example.com>"
    DATE_LINE = "Date: Wed, 23 Jul 2025 12:34:56 +0200"

    def _assert_missing(self, lines, expected_fragment):
        """Parse the joined header lines and check the ValueError message."""
        with self.assertRaises(ValueError) as context:
            parse_git_patch("\n".join(lines))
        self.assertIn(expected_fragment, str(context.exception))

    def test_parse_valid_patch(self):
        diff_text = (
            "diff --git a/file.txt b/file.txt\n"
            "index 83db48f..f735c3b 100644\n"
            "--- a/file.txt\n"
            "+++ b/file.txt\n"
            "@@ -1 +1 @@\n"
            "-Hello World\n"
            "+Hello Patchman\n"
        )
        # Header, blank line, body, blank line, then the diff (which ends in a newline).
        patch_text = "\n".join([
            self.MBOX_LINE,
            "From: Max Mustermann <max@example.com>",
            self.DATE_LINE,
            "Subject: [PATCH] Dein Patch-Titel",
            "",
            "Dies ist eine Beschreibung des Patches.",
            "",
            diff_text,
        ])
        expected_patch = GitPatch(
            commit_hash=self.SHA,
            author="Max Mustermann <max@example.com>",
            date="Wed, 23 Jul 2025 12:34:56 +0200",
            subject="[PATCH] Dein Patch-Titel",
            description="Dies ist eine Beschreibung des Patches.",
            diff=diff_text,
        )
        self.assertEqual(parse_git_patch(patch_text), expected_patch)

    def test_parse_invalid_patch(self):
        self._assert_missing(["Invalid patch content"], "Commit hash not found")

    def test_parse_patch_missing_author(self):
        self._assert_missing(
            [self.MBOX_LINE, self.DATE_LINE, "Subject: [PATCH] Test"],
            "Author not found",
        )

    def test_parse_patch_missing_date(self):
        self._assert_missing(
            [self.MBOX_LINE, self.AUTHOR_LINE, "Subject: [PATCH] Test"],
            "Date not found",
        )

    def test_parse_patch_missing_subject(self):
        self._assert_missing(
            [self.MBOX_LINE, self.AUTHOR_LINE, self.DATE_LINE],
            "Subject not found",
        )

    def test_parse_patch_without_description_and_diff(self):
        # Headers only: no blank line, no body, no diff.
        patch_text = "\n".join([
            self.MBOX_LINE,
            self.AUTHOR_LINE,
            self.DATE_LINE,
            "Subject: [PATCH] Minimal patch",
        ])

        parsed_patch = parse_git_patch(patch_text)

        expected_fields = {
            "commit_hash": self.SHA,
            "author": "Test Author <test@example.com>",
            "date": "Wed, 23 Jul 2025 12:34:56 +0200",
            "subject": "[PATCH] Minimal patch",
            "description": "",
            "diff": "",
        }
        for field_name, expected_value in expected_fields.items():
            self.assertEqual(getattr(parsed_patch, field_name), expected_value)
||||
|
||||
# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|
Reference in New Issue
Block a user