Commit ae6c466

Merge pull request #9 from djokester/2-add-flags-for-aggregation-and-verbosity
#2 Add Flags for Aggregation and Verbosity
2 parents 9b92ca1 + 146da17 commit ae6c466

18 files changed, +224 -132 lines changed

README.md (+31 -4)

@@ -1,7 +1,12 @@
 <h1 align="center">
 GroqEval.
-</h1>
 <br>
+<a href="https://badge.fury.io/py/groqeval"><img src="https://badge.fury.io/py/groqeval.svg" alt="PyPI version" height="19"></a>
+<a href="https://codecov.io/github/djokester/groqeval" height="18">
+<img src="https://codecov.io/github/djokester/groqeval/graph/badge.svg?token=HS4K1Z7F3P"/>
+</a>
+<img alt="GitHub Actions Workflow Status" src="https://img.shields.io/github/actions/workflow/status/djokester/groqeval/codecov.yml?branch=main&style=flat&label=Tests">
+</h1>

 ---

@@ -10,27 +15,49 @@ GroqEval is a powerful and easy-to-use evaluation framework designed specificall
 ## Getting Started

 Installation
+To install GroqEval, simply use pip:
 ```bash
 pip install groqeval
 ```

 Initialising an evaluator.
+To begin using GroqEval, you need to initialize an evaluator with your API key:
 ```python
 from groqeval import GroqEval
 evaluator = GroqEval(api_key=API_KEY)
 ```
-The evaluator is the central orchestrator that initializes the metrics.
+The evaluator is the central component that orchestrates the initialization and execution of various metrics.

+You can create metric instances with the evaluator. Here's the default behavior:
 ```python
-from groqeval.evaluate import GroqEval
+# Default Behaviour
 metrics = evaluator(metric_name, **kwargs)
+
+# Verbosity Enabled
+metrics = evaluator(metric_name, verbose=True, **kwargs)
+```
+Three additional keyword arguments form the basis of evaluation: context, prompt, and output. Their usage varies by metric and is detailed in the respective sections for each metric.
+
+Once the metric class is initialized with the inputs, you can obtain the score by calling the score() function:
+```python
+metrics.score()
+```
+By default the `score` function uses a default aggregation function which is the average for relevance type metrics and max for metrics like bias and toxicity. You can pass a custom aggregation function to the score function. The custom function should accept a list of integers and return a float or integer value:
+```python
+from typing import List, Union
+
+def custom_function(scores: List[int]) -> Union[int, float]:
+    # Define your custom aggregation function.
+
+metrics.score(aggregation = custom_function)
 ```

-To list all the available metrics
+To list all available metrics offered by GroqEval:
 ```python
 >>> evaluator.list_metrics()
 ['AnswerRelevance', 'Bias', 'ContextRelevance', 'Faithfulness', 'Hallucination', 'Toxicity']
 ```
+This section provides an overview of how to set up and use GroqEval. For detailed usage and calculation methods of each metric, refer to the respective metric sections below.

 ## Answer Relevance
 The Answer Relevance metric evaluates how accurately and closely the responses of a language model align with the specific query or prompt provided. This metric ensures that each part of the output, recognized as coherent statements, is scored for its relevance to the original question, helping to gauge the utility and appropriateness of the model's responses.

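For reference, the `custom_function` snippet added to the README leaves the function body elided. A minimal completed sketch (not part of this diff; the median is an arbitrary choice, and the metric instance is assumed to have been created via the evaluator as documented above):

```python
import statistics
from typing import List, Union

def custom_function(scores: List[int]) -> Union[int, float]:
    # Any reduction over the per-statement integer scores works; median is just an example.
    return statistics.median(scores)

# Assumes `metrics` is a metric instance created via evaluator(...) as shown in the README.
result = metrics.score(aggregation=custom_function)
print(result["score"])            # aggregated value
print(result["score_breakdown"])  # per-statement breakdown parsed from the model response
```
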
groqeval/__init__.py (+3 -1)

@@ -1 +1,3 @@
-from groqeval.evaluate import GroqEval
+from groqeval.evaluate import GroqEval
+
+__all__ = ["GroqEval"]

groqeval/metrics/answer_relevance.py (+9 -19)

@@ -1,6 +1,7 @@
 # groqeval/metrics/answer_relevance.py
 import json
 from groq import Groq
+from cachetools import cached, TTLCache
 from groqeval.models.output import Output, ScoredOutput
 from groqeval.metrics.base_metric import BaseMetric

@@ -12,8 +13,8 @@ class AnswerRelevance(BaseMetric):
     relevance to the original question, helping to gauge the utility and appropriateness
     of the model's responses.
     """
-    def __init__(self, groq_client: Groq, output: str, prompt: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, output: str, prompt: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.output = output
         self.prompt = prompt
         self.check_data_types(prompt=prompt, output=output)
@@ -66,15 +67,16 @@ def output_decomposition(self):
             {"role": "system", "content": self.output_decomposition_prompt},
             {"role": "user", "content": self.output}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Decomposition of the Output into Statements: \n%s", response.choices[0].message.content)
         return Output.model_validate_json(response.choices[0].message.content)

+    @cached(cache=TTLCache(maxsize=100, ttl=300))
     def score_relevance(self):
         """
         Each identified statement is then scored on a scale from 1 (completely irrelevant)
@@ -93,21 +95,9 @@ def score_relevance(self):
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Answer Relevance Score: \n%s", response.choices[0].message.content)
         return ScoredOutput.model_validate_json(response.choices[0].message.content), json.loads(response.choices[0].message.content)

-    def score(self):
-        """
-        Aggregation of individual scores and final result.
-        """
-        scored_output, output_dictionary = self.score_relevance()
-        if scored_output.scores:
-            average_score = sum([output.score for output in scored_output.scores]) / len(scored_output.scores)
-            return {
-                'score': average_score,
-                'score_breakdown': output_dictionary
-            }
-        else:
-            return {
-                'score': 0, # Default to 0 if there are no sentences to score
-                'score_breakdown': output_dictionary
-            }
+    @property
+    def scoring_function(self):
+        return self.score_relevance

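As a usage illustration (not part of this diff), the new `**kwargs` path lets verbosity be switched on per metric instance. A hedged sketch that constructs `AnswerRelevance` directly, with a placeholder API key and example texts:

```python
from groq import Groq
from groqeval.metrics.answer_relevance import AnswerRelevance

client = Groq(api_key="YOUR_GROQ_API_KEY")  # placeholder key

metric = AnswerRelevance(
    client,
    output="Paris is the capital of France and is home to the Louvre.",
    prompt="What is the capital of France?",
    verbose=True,  # forwarded via **kwargs; enables the INFO logging added in this PR
)
result = metric.score()  # relevance metrics default to statistics.mean aggregation
print(result["score"])
```
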
groqeval/metrics/base_metric.py (+59 -19)

@@ -1,9 +1,27 @@
-class BaseMetric:
+import logging
+import statistics
+from abc import ABC,abstractmethod
+from groq import Groq
+
+class BaseMetric(ABC):
     """
     The Base Metric class.
     """
-    def __init__(self, groq_client):
+    def __init__(self, groq_client: Groq, verbose: bool = None):
         self.groq_client = groq_client
+        self.aggregation = statistics.mean
+        self.logger = logging.getLogger(__name__)
+        handler = logging.StreamHandler() # Stream handler to output to the console
+        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        handler.setFormatter(formatter)
+        self.logger.addHandler(handler)
+        self.logger.propagate = False
+
+        if verbose:
+            self.logger.setLevel(logging.INFO) # Set to DEBUG to see all levels of logs
+            self.logger.info("Verbose Mode is on.")
+        else:
+            self.logger.setLevel(logging.WARNING)

     def groq_chat_completion(self, messages, model, temperature=0.5, response_format=None):
         """
@@ -15,29 +33,51 @@ def groq_chat_completion(self, messages, model, temperature=0.5, response_format
             temperature=temperature,
             response_format=response_format
         )
-        print(chat_completion.choices[0].message.content)
         return chat_completion

     def check_data_types(self, **kwargs):
         """
         Checks for empty strings in the arguments
         """
         for key, value in kwargs.items():
-            if key != "context":
-                if value == "":
-                    raise ValueError(f"'{key}' cannot be an empty string.")
-                if not isinstance(value, str):
-                    raise TypeError(f"'{key}' must be a string")
-            else:
-                if len(value) == 0:
-                    raise ValueError(f"'{key}' cannot be an empty list.")
-                if not isinstance(value, list):
-                    raise TypeError(f"'{key}' must be a list of strings")
+            if key != "verbose":
+                if key != "context":
+                    if value == "":
+                        raise ValueError(f"'{key}' cannot be an empty string.")
+                    if not isinstance(value, str):
+                        raise TypeError(f"'{key}' must be a string")
                 else:
-                    if not all(isinstance(item, str) for item in value):
-                        raise TypeError(f"All items in '{key}' must be strings")
-
-
-
-    def score(self):
+                    if len(value) == 0:
+                        raise ValueError(f"'{key}' cannot be an empty list.")
+                    if not isinstance(value, list):
+                        raise TypeError(f"'{key}' must be a list of strings")
+                    else:
+                        if not all(isinstance(item, str) for item in value):
+                            raise TypeError(f"All items in '{key}' must be strings")
+
+    @property
+    @abstractmethod
+    def scoring_function(self):
+        """
+        This property should be implemented by each child class
+        """
         raise NotImplementedError("This method should be overridden by subclasses")
+
+    def score(self, aggregation = None):
+        """
+        Aggregation of individual scores and final result.
+        """
+        if aggregation is not None:
+            self.aggregation = aggregation
+        scored_output, output_dictionary = self.scoring_function()
+        if scored_output.scores:
+            average_score = self.aggregation([output.score for output in scored_output.scores])
+            return {
+                'score': average_score,
+                'score_breakdown': output_dictionary
+            }
+        else:
+            return {
+                'score': 0, # Default to 0 if there are no sentences to score
+                'score_breakdown': output_dictionary
+            }

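With this refactor, aggregation, the empty-score fallback, and logging all live in `BaseMetric`; a concrete metric only has to provide a `scoring_function` property. A minimal sketch of the contract (not part of this diff), using a hypothetical `ExampleMetric` with stub score objects as stand-ins for the package's `ScoredOutput`:

```python
from dataclasses import dataclass, field
from typing import List

from groq import Groq
from groqeval.metrics.base_metric import BaseMetric


@dataclass
class StubScore:
    score: int


@dataclass
class StubScoredOutput:
    scores: List[StubScore] = field(default_factory=list)


class ExampleMetric(BaseMetric):
    """Hypothetical metric illustrating the new scoring_function contract."""

    def __init__(self, groq_client: Groq, output: str, prompt: str, **kwargs):
        super().__init__(groq_client, kwargs.get('verbose'))
        self.output = output
        self.prompt = prompt
        self.check_data_types(prompt=prompt, output=output)

    def score_example(self):
        # A real metric would call self.groq_chat_completion(...) and parse the JSON
        # response; this stub returns fixed scores so BaseMetric.score can aggregate them.
        stub = StubScoredOutput(scores=[StubScore(3), StubScore(5)])
        return stub, {"scores": [3, 5]}

    @property
    def scoring_function(self):
        return self.score_example
```

Calling `ExampleMetric(client, output="...", prompt="...").score()` would then report a score of 4 (the mean of 3 and 5), since the inherited default aggregation is `statistics.mean`, with the stub dictionary returned as `score_breakdown`.
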
groqeval/metrics/bias.py (+12 -17)

@@ -1,6 +1,7 @@
 # groqeval/metrics/bias.py
 import json
 from groq import Groq
+from cachetools import cached, TTLCache
 from groqeval.models.output import Output, ScoredOutput
 from groqeval.metrics.base_metric import BaseMetric

@@ -12,10 +13,12 @@ class Bias(BaseMetric):
     context-driven expressions. This metric ensures that responses maintain a level of
     objectivity and are free from prejudiced or skewed perspectives.
     """
-    def __init__(self, groq_client: Groq, output: str, prompt: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, output: str, prompt: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.output = output
         self.prompt = prompt
+        self.aggregation = max
+
         self.check_data_types(prompt=prompt, output=output)

     @property
@@ -70,15 +73,16 @@ def output_decomposition(self):
             {"role": "system", "content": self.output_decomposition_prompt},
             {"role": "user", "content": self.output}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Decomposition of the Output into Opinions: \n%s", response.choices[0].message.content)
         return Output.model_validate_json(response.choices[0].message.content)

+    @cached(cache=TTLCache(maxsize=100, ttl=300))
     def score_bias(self):
         """
         Each opinion in the output is scored on a scale from 1 (completely unbiased)
@@ -97,18 +101,9 @@ def score_bias(self):
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Bias Score: \n%s", response.choices[0].message.content)
         return ScoredOutput.model_validate_json(response.choices[0].message.content), json.loads(response.choices[0].message.content)
-
-    def score(self):
-        scored_output, output_dictionary = self.score_bias()
-        if scored_output.scores:
-            average_score = max([output.score for output in scored_output.scores])
-            return {
-                'score': average_score,
-                'score_breakdown': output_dictionary
-            }
-        else:
-            return {
-                'score': 0, # Default to 0 if there are no sentences to score
-                'score_breakdown': output_dictionary
-            }
+
+    @property
+    def scoring_function(self):
+        return self.score_bias

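Because `Bias.__init__` now sets `self.aggregation = max`, the default score reflects the single most biased opinion, while the shared `score()` still accepts an override; and since `score_bias` is wrapped in a TTL cache, the second call below should reuse the first model response rather than re-querying. A hedged sketch with placeholder inputs (not part of this diff):

```python
import statistics

from groq import Groq
from groqeval.metrics.bias import Bias

client = Groq(api_key="YOUR_GROQ_API_KEY")  # placeholder key

bias_metric = Bias(
    client,
    output="The new policy is a disaster pushed through by out-of-touch bureaucrats.",
    prompt="Summarise the new policy announcement.",
)

worst_case = bias_metric.score()                           # default for Bias: max of opinion scores
averaged = bias_metric.score(aggregation=statistics.mean)  # caller-supplied aggregation
print(worst_case["score"], averaged["score"])
```
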
groqeval/metrics/context_relevance.py (+9 -16)

@@ -2,6 +2,7 @@
 import json
 from typing import List
 from groq import Groq
+from cachetools import cached, TTLCache
 from groqeval.models.context import Context, ScoredContext
 from groqeval.metrics.base_metric import BaseMetric

@@ -13,8 +14,8 @@ class ContextRelevance(BaseMetric):
     to the generator is pertinent and likely to enhance the quality and
     accuracy of the generated responses.
     """
-    def __init__(self, groq_client: Groq, context: List[str], prompt: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, context: List[str], prompt: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.context = context
         self.prompt = prompt
         self.check_data_types(prompt=prompt, context=context)
@@ -79,15 +80,16 @@ def context_decomposition(self):
             {"role": "system", "content": self.context_decomposition_prompt},
             {"role": "user", "content": self.format_retrieved_context}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Decomposition of the Context into Statements: \n%s", response.choices[0].message.content)
         return Context.model_validate_json(response.choices[0].message.content)

+    @cached(cache=TTLCache(maxsize=100, ttl=300))
     def score_relevance(self):
         """
         Each statement of context is evaluated to determine if it can be
@@ -110,18 +112,9 @@ def score_relevance(self):
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Context Relevance Score: \n%s", response.choices[0].message.content)
         return ScoredContext.model_validate_json(response.choices[0].message.content), json.loads(response.choices[0].message.content)

-    def score(self):
-        scored_context, output_dictionary = self.score_relevance()
-        if scored_context.scores:
-            average_score = sum([context.score for context in scored_context.scores]) / len(scored_context.scores)
-            return {
-                'score': average_score,
-                'score_breakdown': output_dictionary
-            }
-        else:
-            return {
-                'score': 0, # Default to 0 if there are no sentences to score
-                'score_breakdown': output_dictionary
-            }
+    @property
+    def scoring_function(self):
+        return self.score_relevance

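Finally, `ContextRelevance` takes the retrieved context as a list of strings (validated by `check_data_types`). A hedged usage sketch with placeholder retrieval results (not part of this diff):

```python
from groq import Groq
from groqeval.metrics.context_relevance import ContextRelevance

client = Groq(api_key="YOUR_GROQ_API_KEY")  # placeholder key

metric = ContextRelevance(
    client,
    context=[
        "Paris is the capital and most populous city of France.",
        "The Eiffel Tower, completed in 1889, is a wrought-iron lattice tower in Paris.",
    ],
    prompt="What is the capital of France?",
    verbose=True,
)
print(metric.score())  # {'score': <mean statement relevance>, 'score_breakdown': {...}}
```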