Commit ae6c466

Merge pull request #9 from djokester/2-add-flags-for-aggregation-and-verbosity
#2 Add Flags for Aggregation and Verbosity
2 parents 9b92ca1 + 146da17 commit ae6c466

18 files changed, +224 -132 lines changed

README.md (+31 -4)

@@ -1,7 +1,12 @@
 <h1 align="center">
 GroqEval.
-</h1>
 <br>
+<a href="https://badge.fury.io/py/groqeval"><img src="https://badge.fury.io/py/groqeval.svg" alt="PyPI version" height="19"></a>
+<a href="https://codecov.io/github/djokester/groqeval" height="18">
+<img src="https://codecov.io/github/djokester/groqeval/graph/badge.svg?token=HS4K1Z7F3P"/>
+</a>
+<img alt="GitHub Actions Workflow Status" src="https://img.shields.io/github/actions/workflow/status/djokester/groqeval/codecov.yml?branch=main&style=flat&label=Tests">
+</h1>

 ---

@@ -10,27 +15,49 @@ GroqEval is a powerful and easy-to-use evaluation framework designed specificall
 ## Getting Started

 Installation
+To install GroqEval, simply use pip:
 ```bash
 pip install groqeval
 ```

 Initialising an evaluator.
+To begin using GroqEval, you need to initialize an evaluator with your API key:
 ```python
 from groqeval import GroqEval
 evaluator = GroqEval(api_key=API_KEY)
 ```
-The evaluator is the central orchestrator that initializes the metrics.
+The evaluator is the central component that orchestrates the initialization and execution of various metrics.

+You can create metric instances with the evaluator. Here's the default behavior:
 ```python
-from groqeval.evaluate import GroqEval
+# Default Behaviour
 metrics = evaluator(metric_name, **kwargs)
+
+# Verbosity Enabled
+metrics = evaluator(metric_name, verbose=True, **kwargs)
+```
+Three additional keyword arguments form the basis of evaluation: context, prompt, and output. Their usage varies by metric and is detailed in the respective sections for each metric.
+
+Once the metric class is initialized with the inputs, you can obtain the score by calling the score() function:
+```python
+metrics.score()
+```
+By default the `score` function uses a default aggregation function which is the average for relevance type metrics and max for metrics like bias and toxicity. You can pass a custom aggregation function to the score function. The custom function should accept a list of integers and return a float or integer value:
+```python
+from typing import List, Union
+
+def custom_function(scores: List[int]) -> Union[int, float]:
+    # Define your custom aggregation function.
+
+metrics.score(aggregation = custom_function)
 ```

-To list all the available metrics
+To list all available metrics offered by GroqEval:
 ```python
 >>> evaluator.list_metrics()
 ['AnswerRelevance', 'Bias', 'ContextRelevance', 'Faithfulness', 'Hallucination', 'Toxicity']
 ```
+This section provides an overview of how to set up and use GroqEval. For detailed usage and calculation methods of each metric, refer to the respective metric sections below.

 ## Answer Relevance
 The Answer Relevance metric evaluates how accurately and closely the responses of a language model align with the specific query or prompt provided. This metric ensures that each part of the output, recognized as coherent statements, is scored for its relevance to the original question, helping to gauge the utility and appropriateness of the model's responses.

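For reference, the `custom_function` snippet added to the README leaves the function body elided. A minimal completed sketch (not part of this diff; the median is an arbitrary choice, and the metric instance is assumed to have been created via the evaluator as documented above):

```python
import statistics
from typing import List, Union

def custom_function(scores: List[int]) -> Union[int, float]:
    # Any reduction over the per-statement integer scores works; median is just an example.
    return statistics.median(scores)

# Assumes `metrics` is a metric instance created via evaluator(...) as shown in the README.
result = metrics.score(aggregation=custom_function)
print(result["score"])            # aggregated value
print(result["score_breakdown"])  # per-statement breakdown parsed from the model response
```
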
groqeval/__init__.py (+3 -1)

@@ -1 +1,3 @@
-from groqeval.evaluate import GroqEval
+from groqeval.evaluate import GroqEval
+
+__all__ = ["GroqEval"]

groqeval/metrics/answer_relevance.py (+9 -19)

@@ -1,6 +1,7 @@
 # groqeval/metrics/answer_relevance.py
 import json
 from groq import Groq
+from cachetools import cached, TTLCache
 from groqeval.models.output import Output, ScoredOutput
 from groqeval.metrics.base_metric import BaseMetric

@@ -12,8 +13,8 @@ class AnswerRelevance(BaseMetric):
     relevance to the original question, helping to gauge the utility and appropriateness
     of the model's responses.
     """
-    def __init__(self, groq_client: Groq, output: str, prompt: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, output: str, prompt: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.output = output
         self.prompt = prompt
         self.check_data_types(prompt=prompt, output=output)
@@ -66,15 +67,16 @@ def output_decomposition(self):
             {"role": "system", "content": self.output_decomposition_prompt},
             {"role": "user", "content": self.output}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Decomposition of the Output into Statements: \n%s", response.choices[0].message.content)
         return Output.model_validate_json(response.choices[0].message.content)

+    @cached(cache=TTLCache(maxsize=100, ttl=300))
     def score_relevance(self):
         """
         Each identified statement is then scored on a scale from 1 (completely irrelevant)
@@ -93,21 +95,9 @@ def score_relevance(self):
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Answer Relevance Score: \n%s", response.choices[0].message.content)
         return ScoredOutput.model_validate_json(response.choices[0].message.content), json.loads(response.choices[0].message.content)

-    def score(self):
-        """
-        Aggregation of individual scores and final result.
-        """
-        scored_output, output_dictionary = self.score_relevance()
-        if scored_output.scores:
-            average_score = sum([output.score for output in scored_output.scores]) / len(scored_output.scores)
-            return {
-                'score': average_score,
-                'score_breakdown': output_dictionary
-            }
-        else:
-            return {
-                'score': 0, # Default to 0 if there are no sentences to score
-                'score_breakdown': output_dictionary
-            }
+    @property
+    def scoring_function(self):
+        return self.score_relevance

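As a usage illustration (not part of this diff), the new `**kwargs` path lets verbosity be switched on per metric instance. A hedged sketch that constructs `AnswerRelevance` directly, with a placeholder API key and example texts:

```python
from groq import Groq
from groqeval.metrics.answer_relevance import AnswerRelevance

client = Groq(api_key="YOUR_GROQ_API_KEY")  # placeholder key

metric = AnswerRelevance(
    client,
    output="Paris is the capital of France and is home to the Louvre.",
    prompt="What is the capital of France?",
    verbose=True,  # forwarded via **kwargs; enables the INFO logging added in this PR
)
result = metric.score()  # relevance metrics default to statistics.mean aggregation
print(result["score"])
```
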
groqeval/metrics/base_metric.py (+59 -19)

@@ -1,9 +1,27 @@
-class BaseMetric:
+import logging
+import statistics
+from abc import ABC,abstractmethod
+from groq import Groq
+
+class BaseMetric(ABC):
     """
     The Base Metric class.
     """
-    def __init__(self, groq_client):
+    def __init__(self, groq_client: Groq, verbose: bool = None):
         self.groq_client = groq_client
+        self.aggregation = statistics.mean
+        self.logger = logging.getLogger(__name__)
+        handler = logging.StreamHandler() # Stream handler to output to the console
+        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        handler.setFormatter(formatter)
+        self.logger.addHandler(handler)
+        self.logger.propagate = False
+
+        if verbose:
+            self.logger.setLevel(logging.INFO) # Set to DEBUG to see all levels of logs
+            self.logger.info("Verbose Mode is on.")
+        else:
+            self.logger.setLevel(logging.WARNING)

     def groq_chat_completion(self, messages, model, temperature=0.5, response_format=None):
         """
@@ -15,29 +33,51 @@ def groq_chat_completion(self, messages, model, temperature=0.5, response_format
             temperature=temperature,
             response_format=response_format
         )
-        print(chat_completion.choices[0].message.content)
         return chat_completion

     def check_data_types(self, **kwargs):
         """
         Checks for empty strings in the arguments
         """
         for key, value in kwargs.items():
-            if key != "context":
-                if value == "":
-                    raise ValueError(f"'{key}' cannot be an empty string.")
-                if not isinstance(value, str):
-                    raise TypeError(f"'{key}' must be a string")
-            else:
-                if len(value) == 0:
-                    raise ValueError(f"'{key}' cannot be an empty list.")
-                if not isinstance(value, list):
-                    raise TypeError(f"'{key}' must be a list of strings")
+            if key != "verbose":
+                if key != "context":
+                    if value == "":
+                        raise ValueError(f"'{key}' cannot be an empty string.")
+                    if not isinstance(value, str):
+                        raise TypeError(f"'{key}' must be a string")
                 else:
-                    if not all(isinstance(item, str) for item in value):
-                        raise TypeError(f"All items in '{key}' must be strings")
-
-
-
-    def score(self):
+                    if len(value) == 0:
+                        raise ValueError(f"'{key}' cannot be an empty list.")
+                    if not isinstance(value, list):
+                        raise TypeError(f"'{key}' must be a list of strings")
+                    else:
+                        if not all(isinstance(item, str) for item in value):
+                            raise TypeError(f"All items in '{key}' must be strings")
+
+    @property
+    @abstractmethod
+    def scoring_function(self):
+        """
+        This property should be implemented by each child class
+        """
         raise NotImplementedError("This method should be overridden by subclasses")
+
+    def score(self, aggregation = None):
+        """
+        Aggregation of individual scores and final result.
+        """
+        if aggregation is not None:
+            self.aggregation = aggregation
+        scored_output, output_dictionary = self.scoring_function()
+        if scored_output.scores:
+            average_score = self.aggregation([output.score for output in scored_output.scores])
+            return {
+                'score': average_score,
+                'score_breakdown': output_dictionary
+            }
+        else:
+            return {
+                'score': 0, # Default to 0 if there are no sentences to score
+                'score_breakdown': output_dictionary
+            }

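With this refactor, aggregation, the empty-score fallback, and logging all live in `BaseMetric`; a concrete metric only has to provide a `scoring_function` property. A minimal sketch of the contract (not part of this diff), using a hypothetical `ExampleMetric` with stub score objects as stand-ins for the package's `ScoredOutput`:

```python
from dataclasses import dataclass, field
from typing import List

from groq import Groq
from groqeval.metrics.base_metric import BaseMetric


@dataclass
class StubScore:
    score: int


@dataclass
class StubScoredOutput:
    scores: List[StubScore] = field(default_factory=list)


class ExampleMetric(BaseMetric):
    """Hypothetical metric illustrating the new scoring_function contract."""

    def __init__(self, groq_client: Groq, output: str, prompt: str, **kwargs):
        super().__init__(groq_client, kwargs.get('verbose'))
        self.output = output
        self.prompt = prompt
        self.check_data_types(prompt=prompt, output=output)

    def score_example(self):
        # A real metric would call self.groq_chat_completion(...) and parse the JSON
        # response; this stub returns fixed scores so BaseMetric.score can aggregate them.
        stub = StubScoredOutput(scores=[StubScore(3), StubScore(5)])
        return stub, {"scores": [3, 5]}

    @property
    def scoring_function(self):
        return self.score_example
```

Calling `ExampleMetric(client, output="...", prompt="...").score()` would then report a score of 4 (the mean of 3 and 5), since the inherited default aggregation is `statistics.mean`, with the stub dictionary returned as `score_breakdown`.
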
groqeval/metrics/bias.py (+12 -17)

@@ -1,6 +1,7 @@
 # groqeval/metrics/bias.py
 import json
 from groq import Groq
+from cachetools import cached, TTLCache
 from groqeval.models.output import Output, ScoredOutput
 from groqeval.metrics.base_metric import BaseMetric

@@ -12,10 +13,12 @@ class Bias(BaseMetric):
     context-driven expressions. This metric ensures that responses maintain a level of
     objectivity and are free from prejudiced or skewed perspectives.
     """
-    def __init__(self, groq_client: Groq, output: str, prompt: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, output: str, prompt: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.output = output
         self.prompt = prompt
+        self.aggregation = max
+
         self.check_data_types(prompt=prompt, output=output)

     @property
@@ -70,15 +73,16 @@ def output_decomposition(self):
             {"role": "system", "content": self.output_decomposition_prompt},
             {"role": "user", "content": self.output}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Decomposition of the Output into Opinions: \n%s", response.choices[0].message.content)
         return Output.model_validate_json(response.choices[0].message.content)

+    @cached(cache=TTLCache(maxsize=100, ttl=300))
     def score_bias(self):
         """
         Each opinion in the output is scored on a scale from 1 (completely unbiased)
@@ -97,18 +101,9 @@ def score_bias(self):
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Bias Score: \n%s", response.choices[0].message.content)
         return ScoredOutput.model_validate_json(response.choices[0].message.content), json.loads(response.choices[0].message.content)
-
-    def score(self):
-        scored_output, output_dictionary = self.score_bias()
-        if scored_output.scores:
-            average_score = max([output.score for output in scored_output.scores])
-            return {
-                'score': average_score,
-                'score_breakdown': output_dictionary
-            }
-        else:
-            return {
-                'score': 0, # Default to 0 if there are no sentences to score
-                'score_breakdown': output_dictionary
-            }
+
+    @property
+    def scoring_function(self):
+        return self.score_bias

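Because `Bias.__init__` now sets `self.aggregation = max`, the default score reflects the single most biased opinion, while the shared `score()` still accepts an override; and since `score_bias` is wrapped in a TTL cache, the second call below should reuse the first model response rather than re-querying. A hedged sketch with placeholder inputs (not part of this diff):

```python
import statistics

from groq import Groq
from groqeval.metrics.bias import Bias

client = Groq(api_key="YOUR_GROQ_API_KEY")  # placeholder key

bias_metric = Bias(
    client,
    output="The new policy is a disaster pushed through by out-of-touch bureaucrats.",
    prompt="Summarise the new policy announcement.",
)

worst_case = bias_metric.score()                           # default for Bias: max of opinion scores
averaged = bias_metric.score(aggregation=statistics.mean)  # caller-supplied aggregation
print(worst_case["score"], averaged["score"])
```
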
groqeval/metrics/context_relevance.py (+9 -16)

@@ -2,6 +2,7 @@
 import json
 from typing import List
 from groq import Groq
+from cachetools import cached, TTLCache
 from groqeval.models.context import Context, ScoredContext
 from groqeval.metrics.base_metric import BaseMetric

@@ -13,8 +14,8 @@ class ContextRelevance(BaseMetric):
     to the generator is pertinent and likely to enhance the quality and
     accuracy of the generated responses.
     """
-    def __init__(self, groq_client: Groq, context: List[str], prompt: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, context: List[str], prompt: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.context = context
         self.prompt = prompt
         self.check_data_types(prompt=prompt, context=context)
@@ -79,15 +80,16 @@ def context_decomposition(self):
             {"role": "system", "content": self.context_decomposition_prompt},
             {"role": "user", "content": self.format_retrieved_context}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Decomposition of the Context into Statements: \n%s", response.choices[0].message.content)
         return Context.model_validate_json(response.choices[0].message.content)

+    @cached(cache=TTLCache(maxsize=100, ttl=300))
     def score_relevance(self):
         """
         Each statement of context is evaluated to determine if it can be
@@ -110,18 +112,9 @@ def score_relevance(self):
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Context Relevance Score: \n%s", response.choices[0].message.content)
         return ScoredContext.model_validate_json(response.choices[0].message.content), json.loads(response.choices[0].message.content)

-    def score(self):
-        scored_context, output_dictionary = self.score_relevance()
-        if scored_context.scores:
-            average_score = sum([context.score for context in scored_context.scores]) / len(scored_context.scores)
-            return {
-                'score': average_score,
-                'score_breakdown': output_dictionary
-            }
-        else:
-            return {
-                'score': 0, # Default to 0 if there are no sentences to score
-                'score_breakdown': output_dictionary
-            }
+    @property
+    def scoring_function(self):
+        return self.score_relevance

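Finally, `ContextRelevance` takes the retrieved context as a list of strings (validated by `check_data_types`). A hedged usage sketch with placeholder retrieval results (not part of this diff):

```python
from groq import Groq
from groqeval.metrics.context_relevance import ContextRelevance

client = Groq(api_key="YOUR_GROQ_API_KEY")  # placeholder key

metric = ContextRelevance(
    client,
    context=[
        "Paris is the capital and most populous city of France.",
        "The Eiffel Tower, completed in 1889, is a wrought-iron lattice tower in Paris.",
    ],
    prompt="What is the capital of France?",
    verbose=True,
)
print(metric.score())  # {'score': <mean statement relevance>, 'score_breakdown': {...}}
```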