Skip to content

Commit 8be6f56

Browse files
refactor: rename clone to clone_repo and consolidate schema & utility modules (#237)
* refactor: rename clone to clone_repo and consolidate schema & utility modules
1 parent 33f4571 commit 8be6f56

19 files changed

+300
-312
lines changed

src/gitingest/__init__.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
"""Gitingest: A package for ingesting data from Git repositories."""
22

3-
from gitingest.cloning import clone
3+
from gitingest.cloning import clone_repo
44
from gitingest.entrypoint import ingest, ingest_async
55
from gitingest.ingestion import ingest_query
66
from gitingest.query_parsing import parse_query
77

8-
__all__ = ["ingest_query", "clone", "parse_query", "ingest", "ingest_async"]
8+
__all__ = ["ingest_query", "clone_repo", "parse_query", "ingest", "ingest_async"]

src/gitingest/cloning.py

+8-151
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
11
"""This module contains functions for cloning a Git repository to a local path."""
22

3-
import asyncio
43
import os
54
from pathlib import Path
6-
from typing import List, Optional, Tuple
5+
from typing import Optional
76

8-
from gitingest.ingestion_schema import CloneConfig
7+
from gitingest.schemas import CloneConfig
8+
from gitingest.utils.git_utils import check_repo_exists, ensure_git_installed, run_command
99
from gitingest.utils.timeout_wrapper import async_timeout
1010

1111
TIMEOUT: int = 60
1212

1313

1414
@async_timeout(TIMEOUT)
15-
async def clone(config: CloneConfig) -> None:
15+
async def clone_repo(config: CloneConfig) -> None:
1616
"""
1717
Clone a repository to a local path based on the provided configuration.
1818
@@ -47,7 +47,7 @@ async def clone(config: CloneConfig) -> None:
4747
raise OSError(f"Failed to create parent directory {parent_dir}: {exc}") from exc
4848

4949
# Check if the repository exists
50-
if not await _check_repo_exists(url):
50+
if not await check_repo_exists(url):
5151
raise ValueError("Repository not found, make sure it is public")
5252

5353
clone_cmd = ["git", "clone", "--single-branch"]
@@ -64,7 +64,8 @@ async def clone(config: CloneConfig) -> None:
6464
clone_cmd += [url, local_path]
6565

6666
# Clone the repository
67-
await _run_command(*clone_cmd)
67+
await ensure_git_installed()
68+
await run_command(*clone_cmd)
6869

6970
if commit or partial_clone:
7071
checkout_cmd = ["git", "-C", local_path]
@@ -81,148 +82,4 @@ async def clone(config: CloneConfig) -> None:
8182
checkout_cmd += ["checkout", commit]
8283

8384
# Check out the specific commit and/or subpath
84-
await _run_command(*checkout_cmd)
85-
86-
87-
async def _check_repo_exists(url: str) -> bool:
88-
"""
89-
Check if a Git repository exists at the provided URL.
90-
91-
Parameters
92-
----------
93-
url : str
94-
The URL of the Git repository to check.
95-
Returns
96-
-------
97-
bool
98-
True if the repository exists, False otherwise.
99-
100-
Raises
101-
------
102-
RuntimeError
103-
If the curl command returns an unexpected status code.
104-
"""
105-
proc = await asyncio.create_subprocess_exec(
106-
"curl",
107-
"-I",
108-
url,
109-
stdout=asyncio.subprocess.PIPE,
110-
stderr=asyncio.subprocess.PIPE,
111-
)
112-
stdout, _ = await proc.communicate()
113-
114-
if proc.returncode != 0:
115-
return False
116-
117-
response = stdout.decode()
118-
status_code = _get_status_code(response)
119-
120-
if status_code in (200, 301):
121-
return True
122-
123-
if status_code in (404, 302):
124-
return False
125-
126-
raise RuntimeError(f"Unexpected status code: {status_code}")
127-
128-
129-
async def fetch_remote_branch_list(url: str) -> List[str]:
130-
"""
131-
Fetch the list of branches from a remote Git repository.
132-
Parameters
133-
----------
134-
url : str
135-
The URL of the Git repository to fetch branches from.
136-
Returns
137-
-------
138-
List[str]
139-
A list of branch names available in the remote repository.
140-
"""
141-
fetch_branches_command = ["git", "ls-remote", "--heads", url]
142-
stdout, _ = await _run_command(*fetch_branches_command)
143-
stdout_decoded = stdout.decode()
144-
145-
return [
146-
line.split("refs/heads/", 1)[1]
147-
for line in stdout_decoded.splitlines()
148-
if line.strip() and "refs/heads/" in line
149-
]
150-
151-
152-
async def _run_command(*args: str) -> Tuple[bytes, bytes]:
153-
"""
154-
Execute a command asynchronously and captures its output.
155-
156-
Parameters
157-
----------
158-
*args : str
159-
The command and its arguments to execute.
160-
161-
Returns
162-
-------
163-
Tuple[bytes, bytes]
164-
A tuple containing the stdout and stderr of the command.
165-
166-
Raises
167-
------
168-
RuntimeError
169-
If command exits with a non-zero status.
170-
"""
171-
await check_git_installed()
172-
173-
# Execute the requested command
174-
proc = await asyncio.create_subprocess_exec(
175-
*args,
176-
stdout=asyncio.subprocess.PIPE,
177-
stderr=asyncio.subprocess.PIPE,
178-
)
179-
stdout, stderr = await proc.communicate()
180-
if proc.returncode != 0:
181-
error_message = stderr.decode().strip()
182-
raise RuntimeError(f"Command failed: {' '.join(args)}\nError: {error_message}")
183-
184-
return stdout, stderr
185-
186-
187-
async def check_git_installed() -> None:
188-
"""
189-
Check if Git is installed and accessible on the system.
190-
191-
Raises
192-
------
193-
RuntimeError
194-
If Git is not installed or if the Git command exits with a non-zero status.
195-
"""
196-
try:
197-
proc = await asyncio.create_subprocess_exec(
198-
"git",
199-
"--version",
200-
stdout=asyncio.subprocess.PIPE,
201-
stderr=asyncio.subprocess.PIPE,
202-
)
203-
_, stderr = await proc.communicate()
204-
if proc.returncode != 0:
205-
error_message = stderr.decode().strip() if stderr else "Git command not found"
206-
raise RuntimeError(f"Git is not installed or not accessible: {error_message}")
207-
208-
except FileNotFoundError as exc:
209-
raise RuntimeError("Git is not installed. Please install Git before proceeding.") from exc
210-
211-
212-
def _get_status_code(response: str) -> int:
213-
"""
214-
Extract the status code from an HTTP response.
215-
216-
Parameters
217-
----------
218-
response : str
219-
The HTTP response string.
220-
221-
Returns
222-
-------
223-
int
224-
The status code of the response
225-
"""
226-
status_line = response.splitlines()[0].strip()
227-
status_code = int(status_line.split(" ", 2)[1])
228-
return status_code
85+
await run_command(*checkout_cmd)

src/gitingest/entrypoint.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import shutil
66
from typing import Optional, Set, Tuple, Union
77

8-
from gitingest.cloning import clone
8+
from gitingest.cloning import clone_repo
99
from gitingest.config import TMP_BASE_PATH
1010
from gitingest.ingestion import ingest_query
1111
from gitingest.query_parsing import IngestionQuery, parse_query
@@ -53,7 +53,7 @@ async def ingest_async(
5353
Raises
5454
------
5555
TypeError
56-
If `clone` does not return a coroutine, or if the `source` is of an unsupported type.
56+
If `clone_repo` does not return a coroutine, or if the `source` is of an unsupported type.
5757
"""
5858
repo_cloned = False
5959

@@ -71,15 +71,15 @@ async def ingest_async(
7171
query.branch = selected_branch
7272

7373
clone_config = query.extract_clone_config()
74-
clone_coroutine = clone(clone_config)
74+
clone_coroutine = clone_repo(clone_config)
7575

7676
if inspect.iscoroutine(clone_coroutine):
7777
if asyncio.get_event_loop().is_running():
7878
await clone_coroutine
7979
else:
8080
asyncio.run(clone_coroutine)
8181
else:
82-
raise TypeError("clone did not return a coroutine as expected.")
82+
raise TypeError("clone_repo did not return a coroutine as expected.")
8383

8484
repo_cloned = True
8585

src/gitingest/ingestion.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
from typing import Tuple
66

77
from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES
8-
from gitingest.filesystem_schema import FileSystemNode, FileSystemNodeType, FileSystemStats
98
from gitingest.output_formatters import format_node
109
from gitingest.query_parsing import IngestionQuery
10+
from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats
1111
from gitingest.utils.ingestion_utils import _should_exclude, _should_include
1212
from gitingest.utils.path_utils import _is_safe_symlink
1313

src/gitingest/output_formatters.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44

55
import tiktoken
66

7-
from gitingest.filesystem_schema import FileSystemNode, FileSystemNodeType
87
from gitingest.query_parsing import IngestionQuery
8+
from gitingest.schemas import FileSystemNode, FileSystemNodeType
99

1010

1111
def format_node(node: FileSystemNode, query: IngestionQuery) -> Tuple[str, str, str]:

src/gitingest/query_parsing.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77
from typing import List, Optional, Set, Union
88
from urllib.parse import unquote, urlparse
99

10-
from gitingest.cloning import _check_repo_exists, fetch_remote_branch_list
1110
from gitingest.config import TMP_BASE_PATH
12-
from gitingest.exceptions import InvalidPatternError
13-
from gitingest.ingestion_schema import IngestionQuery
11+
from gitingest.schemas import IngestionQuery
12+
from gitingest.utils.exceptions import InvalidPatternError
13+
from gitingest.utils.git_utils import check_repo_exists, fetch_remote_branch_list
1414
from gitingest.utils.ignore_patterns import DEFAULT_IGNORE_PATTERNS
1515
from gitingest.utils.query_parser_utils import (
1616
KNOWN_GIT_HOSTS,
@@ -308,6 +308,6 @@ async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str:
308308
"""
309309
for domain in KNOWN_GIT_HOSTS:
310310
candidate = f"https://{domain}/{user_name}/{repo_name}"
311-
if await _check_repo_exists(candidate):
311+
if await check_repo_exists(candidate):
312312
return domain
313313
raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.")

src/gitingest/schemas/__init__.py

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
"""This module contains the schemas for the Gitingest package."""
2+
3+
from gitingest.schemas.filesystem_schema import FileSystemNode, FileSystemNodeType, FileSystemStats
4+
from gitingest.schemas.ingestion_schema import CloneConfig, IngestionQuery
5+
6+
__all__ = ["FileSystemNode", "FileSystemNodeType", "FileSystemStats", "CloneConfig", "IngestionQuery"]

src/gitingest/filesystem_schema.py renamed to src/gitingest/schemas/filesystem_schema.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,8 @@
77
from enum import Enum, auto
88
from pathlib import Path
99

10-
from gitingest.utils.ingestion_utils import _get_encoding_list
10+
from gitingest.utils.file_utils import get_preferred_encodings, is_text_file
1111
from gitingest.utils.notebook_utils import process_notebook
12-
from gitingest.utils.textfile_checker_utils import is_textfile
1312

1413
SEPARATOR = "=" * 48 # Tiktoken, the tokenizer openai uses, counts 2 tokens if we have more than 48
1514

@@ -117,7 +116,7 @@ def content(self) -> str: # pylint: disable=too-many-return-statements
117116
if self.type == FileSystemNodeType.DIRECTORY:
118117
raise ValueError("Cannot read content of a directory node")
119118

120-
if not is_textfile(self.path):
119+
if not is_text_file(self.path):
121120
return "[Non-text file]"
122121

123122
if self.path.suffix == ".ipynb":
@@ -127,7 +126,7 @@ def content(self) -> str: # pylint: disable=too-many-return-statements
127126
return f"Error processing notebook: {exc}"
128127

129128
# Try multiple encodings
130-
for encoding in _get_encoding_list():
129+
for encoding in get_preferred_encodings():
131130
try:
132131
with self.path.open(encoding=encoding) as f:
133132
return f.read()
File renamed without changes.

src/gitingest/utils/file_utils.py

+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
"""Utility functions for working with files and directories."""
2+
3+
import locale
4+
import platform
5+
from pathlib import Path
6+
from typing import List
7+
8+
try:
9+
locale.setlocale(locale.LC_ALL, "")
10+
except locale.Error:
11+
locale.setlocale(locale.LC_ALL, "C")
12+
13+
14+
def get_preferred_encodings() -> List[str]:
    """
    Get list of encodings to try, prioritized for the current platform.

    The candidates are the locale's preferred encoding, a fixed set of common
    fallbacks and, on Windows, two additional code pages. Entries that resolve to a
    codec already present in the list (for example ``"UTF-8"`` reported by the locale
    next to the hard-coded ``"utf-8"``, or ``"iso-8859-1"`` next to ``"latin"``) are
    dropped, so callers never retry a codec that has already been attempted.

    Returns
    -------
    List[str]
        List of encoding names to try in priority order, starting with the
        platform's default encoding followed by common fallback encodings. Each
        codec appears at most once, under the first spelling encountered.
    """
    import codecs  # local import: only needed here to canonicalize codec aliases

    candidates = [locale.getpreferredencoding(), "utf-8", "utf-16", "utf-16le", "utf-8-sig", "latin"]
    if platform.system() == "Windows":
        candidates += ["cp1252", "iso-8859-1"]

    encodings: List[str] = []
    seen = set()
    for name in candidates:
        try:
            # Different aliases of one codec share the same canonical name.
            canonical = codecs.lookup(name).name
        except LookupError:
            # Unknown to Python's codec registry: keep it, compared case-insensitively.
            canonical = name.lower()
        if canonical not in seen:
            seen.add(canonical)
            encodings.append(name)

    return encodings
28+
29+
30+
def is_text_file(path: Path) -> bool:
31+
"""
32+
Determine if the file is likely a text file by trying to decode a small chunk
33+
with multiple encodings, and checking for common binary markers.
34+
35+
Parameters
36+
----------
37+
path : Path
38+
The path to the file to check.
39+
40+
Returns
41+
-------
42+
bool
43+
True if the file is likely textual; False if it appears to be binary.
44+
"""
45+
46+
# Attempt to read a portion of the file in binary mode
47+
try:
48+
with path.open("rb") as f:
49+
chunk = f.read(1024)
50+
except OSError:
51+
return False
52+
53+
# If file is empty, treat as text
54+
if not chunk:
55+
return True
56+
57+
# Check obvious binary bytes
58+
if b"\x00" in chunk or b"\xff" in chunk:
59+
return False
60+
61+
# Attempt multiple encodings
62+
for enc in get_preferred_encodings():
63+
try:
64+
with path.open(encoding=enc) as f:
65+
f.read()
66+
return True
67+
except UnicodeDecodeError:
68+
continue
69+
except OSError:
70+
return False
71+
72+
return False

0 commit comments

Comments
 (0)