
Commit 21dae65

Added Verbosity as an argument
1 parent 1ff8201 · commit 21dae65

12 files changed: +65 −40 lines

README.md (−1)

````diff
@@ -27,7 +27,6 @@ evaluator = GroqEval(api_key=API_KEY)
 The evaluator is the central orchestrator that initializes the metrics.
 
 ```python
-from groqeval.evaluate import GroqEval
 metrics = evaluator(metric_name, **kwargs)
 ```
 
````
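Since the metric constructors below now accept `**kwargs`, the new `verbose` flag can ride along through the evaluator call shown above. A minimal usage sketch, assuming the evaluator forwards its keyword arguments to the metric class unchanged (the metric name is real, but the prompt and output strings here are purely illustrative):

```python
from groqeval.evaluate import GroqEval

evaluator = GroqEval(api_key="YOUR_API_KEY")  # hypothetical key
scorer = evaluator(
    "answer_relevance",
    prompt="What is the capital of France?",   # illustrative input
    output="Paris is the capital of France.",  # illustrative input
    verbose=True,  # forwarded via **kwargs; enables INFO-level logging
)
```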
groqeval/__init__.py (+3 −1)

```diff
@@ -1 +1,3 @@
-from groqeval.evaluate import GroqEval
+from groqeval.evaluate import GroqEval
+
+__all__ = ["GroqEval"]
```
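Defining `__all__` scopes star-imports to the evaluator alone; this is standard Python behavior rather than anything else the commit configures:

```python
# With __all__ = ["GroqEval"], a star-import exposes only the evaluator:
from groqeval import *

evaluator = GroqEval(api_key="YOUR_API_KEY")  # hypothetical key
```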

groqeval/metrics/answer_relevance.py (+4 −3)

```diff
@@ -12,8 +12,8 @@ class AnswerRelevance(BaseMetric):
     relevance to the original question, helping to gauge the utility and appropriateness
     of the model's responses.
     """
-    def __init__(self, groq_client: Groq, output: str, prompt: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, output: str, prompt: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.output = output
         self.prompt = prompt
         self.check_data_types(prompt=prompt, output=output)
@@ -66,13 +66,13 @@ def output_decomposition(self):
             {"role": "system", "content": self.output_decomposition_prompt},
             {"role": "user", "content": self.output}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Decomposition of the Output into Statements: %s", response.choices[0].message.content)
         return Output.model_validate_json(response.choices[0].message.content)
 
     def score_relevance(self):
@@ -93,6 +93,7 @@ def score_relevance(self):
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Answer Relevance Score: %s", response.choices[0].message.content)
         return ScoredOutput.model_validate_json(response.choices[0].message.content), json.loads(response.choices[0].message.content)
 
     def score(self):
```
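The `kwargs.get('verbose')` call is what makes the flag optional: a missing key yields `None`, which is falsy, so the logger level is only raised when the caller opts in. A standalone sketch of the pattern (not the library code itself):

```python
import logging

class Sketch:
    """Mirrors how the metric constructors forward the optional flag."""
    def __init__(self, **kwargs):
        self.logger = logging.getLogger(__name__)
        if kwargs.get("verbose"):  # None when omitted -> level untouched
            self.logger.setLevel(logging.INFO)

print(Sketch().logger.level)              # 0 (NOTSET): inherits the root level
print(Sketch(verbose=True).logger.level)  # 20 (logging.INFO)
```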

groqeval/metrics/base_metric.py (+21 −14)

```diff
@@ -1,9 +1,16 @@
+import logging
+from groq import Groq
+
 class BaseMetric:
     """
     The Base Metric class.
     """
-    def __init__(self, groq_client):
+    def __init__(self, groq_client: Groq, verbose: bool = None):
         self.groq_client = groq_client
+        self.logger = logging.getLogger(__name__)
+        if verbose:
+            self.logger.setLevel(logging.INFO)
+
 
     def groq_chat_completion(self, messages, model, temperature=0.5, response_format=None):
         """
@@ -15,27 +22,27 @@ def groq_chat_completion(self, messages, model, temperature=0.5, response_format
             temperature=temperature,
             response_format=response_format
         )
-        print(chat_completion.choices[0].message.content)
         return chat_completion
 
     def check_data_types(self, **kwargs):
         """
         Checks for empty strings in the arguments
         """
         for key, value in kwargs.items():
-            if key != "context":
-                if value == "":
-                    raise ValueError(f"'{key}' cannot be an empty string.")
-                if not isinstance(value, str):
-                    raise TypeError(f"'{key}' must be a string")
-            else:
-                if len(value) == 0:
-                    raise ValueError(f"'{key}' cannot be an empty list.")
-                if not isinstance(value, list):
-                    raise TypeError(f"'{key}' must be a list of strings")
+            if key != "verbose":
+                if key != "context":
+                    if value == "":
+                        raise ValueError(f"'{key}' cannot be an empty string.")
+                    if not isinstance(value, str):
+                        raise TypeError(f"'{key}' must be a string")
                 else:
-                    if not all(isinstance(item, str) for item in value):
-                        raise TypeError(f"All items in '{key}' must be strings")
+                    if len(value) == 0:
+                        raise ValueError(f"'{key}' cannot be an empty list.")
+                    if not isinstance(value, list):
+                        raise TypeError(f"'{key}' must be a list of strings")
+                    else:
+                        if not all(isinstance(item, str) for item in value):
+                            raise TypeError(f"All items in '{key}' must be strings")
 
 
 
```
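One caveat worth noting: `setLevel(logging.INFO)` lowers the logger's threshold but emits nothing by itself; records still need a handler. A sketch of how a caller might surface the new INFO logs, assuming nothing else has configured logging (the `basicConfig` call is the application's responsibility, not part of this commit):

```python
import logging

# Without any handler configured, INFO records are dropped by logging's
# "last resort" handler, which only passes WARNING and above. Configuring
# the root logger once at program start makes the metric's logs visible:
logging.basicConfig(
    format="%(asctime)s %(name)s %(levelname)s %(message)s",
)
# After BaseMetric(..., verbose=True) sets its own logger to INFO, records
# propagate up to the root handler configured above and are printed.
```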

groqeval/metrics/bias.py (+4 −3)

```diff
@@ -12,8 +12,8 @@ class Bias(BaseMetric):
     context-driven expressions. This metric ensures that responses maintain a level of
     objectivity and are free from prejudiced or skewed perspectives.
     """
-    def __init__(self, groq_client: Groq, output: str, prompt: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, output: str, prompt: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.output = output
         self.prompt = prompt
         self.check_data_types(prompt=prompt, output=output)
@@ -70,13 +70,13 @@ def output_decomposition(self):
             {"role": "system", "content": self.output_decomposition_prompt},
             {"role": "user", "content": self.output}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Decomposition of the Output into Opinions: %s", response.choices[0].message.content)
         return Output.model_validate_json(response.choices[0].message.content)
 
     def score_bias(self):
@@ -97,6 +97,7 @@ def score_bias(self):
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Bias Score: %s", response.choices[0].message.content)
         return ScoredOutput.model_validate_json(response.choices[0].message.content), json.loads(response.choices[0].message.content)
 
     def score(self):
```

groqeval/metrics/context_relevance.py (+4 −3)

```diff
@@ -13,8 +13,8 @@ class ContextRelevance(BaseMetric):
     to the generator is pertinent and likely to enhance the quality and
     accuracy of the generated responses.
     """
-    def __init__(self, groq_client: Groq, context: List[str], prompt: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, context: List[str], prompt: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.context = context
         self.prompt = prompt
         self.check_data_types(prompt=prompt, context=context)
@@ -79,13 +79,13 @@ def context_decomposition(self):
             {"role": "system", "content": self.context_decomposition_prompt},
             {"role": "user", "content": self.format_retrieved_context}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Decomposition of the Context into Statements: %s", response.choices[0].message.content)
         return Context.model_validate_json(response.choices[0].message.content)
 
     def score_relevance(self):
@@ -110,6 +110,7 @@ def score_relevance(self):
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Context Relevance Score: %s", response.choices[0].message.content)
         return ScoredContext.model_validate_json(response.choices[0].message.content), json.loads(response.choices[0].message.content)
 
     def score(self):
```

groqeval/metrics/faithfulness.py (+4 −4)

```diff
@@ -12,8 +12,8 @@ class Faithfulness(BaseMetric):
     content is not only relevant but also accurate and truthful with respect to the given context,
     critical for maintaining the integrity and reliability of the model's responses.
     """
-    def __init__(self, groq_client: Groq, context: List[str], output: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, context: List[str], output: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.context = context
         self.output = output
         self.check_data_types(context=context, output=output)
@@ -80,13 +80,13 @@ def output_decomposition(self):
             {"role": "system", "content": self.output_decomposition_prompt},
             {"role": "user", "content": self.output}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Decomposition of the Output into Claims: %s", response.choices[0].message.content)
         return Output.model_validate_json(response.choices[0].message.content)
 
     def score_faithfulness(self):
@@ -106,13 +106,13 @@ def score_faithfulness(self):
             {"role": "system", "content": self.faithfulness_prompt},
             {"role": "user", "content": json.dumps({"sentences": [s.string for s in coherent_sentences]}, indent=2)}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Faithfulness Score: %s", response.choices[0].message.content)
         return ScoredOutput.model_validate_json(response.choices[0].message.content), json.loads(response.choices[0].message.content)
 
     def score(self):
```

groqeval/metrics/hallucination.py (+4 −3)

```diff
@@ -13,8 +13,8 @@ class Hallucination(BaseMetric):
     This is crucial for ensuring that the generated outputs remain grounded in the provided
     context and do not mislead or introduce inaccuracies.
     """
-    def __init__(self, groq_client: Groq, context: List[str], output: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, context: List[str], output: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.context = context
         self.output = output
         self.check_data_types(context=context, output=output)
@@ -89,13 +89,13 @@ def context_decomposition(self):
             {"role": "system", "content": self.context_decomposition_prompt},
             {"role": "user", "content": self.format_retrieved_context}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Decomposition of the Context into Statements: %s", response.choices[0].message.content)
         return Context.model_validate_json(response.choices[0].message.content)
 
     def score_hallucination(self):
@@ -116,6 +116,7 @@ def score_hallucination(self):
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Hallucination Score: %s", response.choices[0].message.content)
         return ScoredContext.model_validate_json(response.choices[0].message.content), json.loads(response.choices[0].message.content)
 
     def score(self):
```

groqeval/metrics/toxicity.py (+3 −3)

```diff
@@ -12,8 +12,8 @@ class Toxicity(BaseMetric):
     wider consumption, identifying any language that could be considered
     insulting, aggressive, or otherwise damaging.
     """
-    def __init__(self, groq_client: Groq, output: str, prompt: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, output: str, prompt: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.output = output
         self.prompt = prompt
         self.check_data_types(prompt=prompt, output=output)
@@ -69,13 +69,13 @@ def output_decomposition(self):
             {"role": "system", "content": self.output_decomposition_prompt},
             {"role": "user", "content": self.output}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Toxicity Score: %s", response.choices[0].message.content)
         return Output.model_validate_json(response.choices[0].message.content)
 
     def score_toxicity(self):
```

pyproject.toml (+2 −1)

```diff
@@ -21,7 +21,8 @@ readme = "README.md"
 requires-python = ">=3.10"
 
 dependencies = [
-    "groq==0.9.0"
+    "groq>=0.9.0",
+    "pydantic>=2.7.4"
 ]
 
 [tool.twine]
```

requirements.txt (+2 −1)

```diff
@@ -1 +1,2 @@
-groq==0.9.0
+groq==0.9.0
+pydantic==2.7.4
```

tests/test_evaluate.py (+14 −3)

```diff
@@ -1,6 +1,6 @@
 import os
 import importlib
-from typing import List, Dict
+import pytest
 from conftest import get_class_args, generate_random_value
 
 def metricize(file_name: str):
@@ -26,5 +26,16 @@ def test_load_metrics(evaluator, metrics_folder, metrics_module):
         class_ = getattr(module, class_name)
         class_args = get_class_args(class_)
         random_args = {name: generate_random_value(param) for name, param in class_args.items()}
-        print(class_name, random_args)
-        assert type(evaluator(module_name, **random_args)) == class_
+        assert type(evaluator(module_name, **random_args)) == class_
+
+def test_load_base_metric(evaluator, metrics_module):
+    module_name = "base_metric"
+    module_path = f'{metrics_module}.{"base_metric"}'
+    module = importlib.import_module(module_path)
+    class_name = metricize(module_name)
+
+    class_ = getattr(module, class_name)
+    class_args = get_class_args(class_)
+    random_args = {name: generate_random_value(param) for name, param in class_args.items()}
+    with pytest.raises(TypeError, match=f"{class_name} is not a valid metric class"):
+        base_metric = evaluator(module_name, **random_args)
```
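The new test leans on `pytest.raises(..., match=...)`, where `match` is a regular expression searched against the string of the raised exception. A self-contained sketch of the same pattern, with the evaluator behavior stubbed out rather than imported from the package:

```python
import pytest

def make_metric(name: str):
    """Stand-in for the evaluator: refuses the abstract base class."""
    if name == "base_metric":
        raise TypeError("BaseMetric is not a valid metric class")
    return object()

def test_rejects_base_metric():
    # match= is applied with re.search against str(excinfo.value)
    with pytest.raises(TypeError, match="is not a valid metric class"):
        make_metric("base_metric")
```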
