Skip to content

Commit aaeaea7

Browse files
authored
Merge pull request #5829 from uktrade/feature/fetch-emails-from-exchange
Email ingestion from Exchange.
2 parents 02bd991 + 41ef911 commit aaeaea7

File tree

4 files changed

+284
-99
lines changed

4 files changed

+284
-99
lines changed

README.md

+6-5
Original file line numberDiff line numberDiff line change
@@ -450,12 +450,13 @@ Data Hub API can run on any Heroku-style platform. Configuration is performed vi
450450
| `EXPORT_WIN_LEAD_OFFICER_REJECTED_TEMPLATE_ID` | Yes | An ID of Notify Template for Export Win Lead Officer Rejected notifications |
451451
| `TASK_REMINDER_EMAIL_TEMPLATE_ID` | No | An ID of Notify Template for the generic Task reminder notifications |
452452
| `TASK_NOTIFICATION_FROM_OTHERS_TEMPLATE_ID` | Yes | An ID of Notify Template for Task assigned by others notifications |
453-
| `MAILBOX_AWS_ACCESS_KEY_ID` | No | Same use as AWS_ACCESS_KEY_ID, but for mailbox. |
454-
| `MAILBOX_AWS_SECRET_ACCESS_KEY` | No | Same use as AWS_SECRET_ACCESS_KEY, but for mailbox. |
455-
| `MAILBOX_AWS_REGION` | No | Same use as AWS_DEFAULT_REGION, but for mailbox. |
456-
| `MAILBOX_BUCKET` | No | S3 bucket for mailbox storage. |
457-
| `MAILBOX_INGESTION_SUCCESS_TEMPLATE_ID` | No | An ID of Notify Template for mailbox ingestion success |
453+
| `MAILBOX_INGESTION_CLIENT_ID` | No | An OAuth Client ID for Email Ingestion Exchange Server |
454+
| `MAILBOX_INGESTION_CLIENT_SECRET` | No | An OAuth Client Secret for Email Ingestion Exchange Server |
455+
| `MAILBOX_INGESTION_EMAIL` | No | The email address for Email Ingestion Exchange Server |
456+
| `MAILBOX_INGESTION_GRAPH_URL` | No | Graph API URL for Email Ingestion Exchange Server |
457+
| `MAILBOX_INGESTION_TENANT_ID` | No | A Tenant ID for Email Ingestion Exchange Server |
458458
| `MAILBOX_INGESTION_FAILURE_TEMPLATE_ID` | No | An ID of Notify Template for mailbox ingestion failure |
459+
| `MAILBOX_INGESTION_SUCCESS_TEMPLATE_ID` | No | An ID of Notify Template for mailbox ingestion success |
459460
| `MARKET_ACCESS_ACCESS_KEY_ID` | No | A non-secret access key ID used by the Market Access service to access Hawk-authenticated public company endpoints. |
460461
| `MARKET_ACCESS_SECRET_ACCESS_KEY` | If `MARKET_ACCESS_ACCESS_KEY_ID` is set | A secret key used by the Market Access service to access Hawk-authenticated public company endpoints. |
461462
| `NOTIFICATION_SUMMARY_THRESHOLD` | No | Number of notification items that trigger sending a summary email. (default=5) |

config/settings/common.py

+20-6
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,26 @@ def _build_redis_url(base_url, db_number, **query_args):
537537
default='',
538538
)
539539

540+
# Credentials and endpoints for the Exchange mailbox used by email ingestion.
# All values are optional and default to an empty string, except the Graph
# API URL, which defaults to the public Microsoft Graph v1.0 endpoint.
MAILBOX_INGESTION_CLIENT_ID = env(
    'MAILBOX_INGESTION_CLIENT_ID',
    default='',
)
MAILBOX_INGESTION_CLIENT_SECRET = env(
    'MAILBOX_INGESTION_CLIENT_SECRET',
    default='',
)
MAILBOX_INGESTION_TENANT_ID = env(
    'MAILBOX_INGESTION_TENANT_ID',
    default='',
)
# The URL must end with a trailing slash: consumers append 'users/<email>'
# directly to it.
MAILBOX_INGESTION_GRAPH_URL = env(
    'MAILBOX_INGESTION_GRAPH_URL',
    default='https://graph.microsoft.com/v1.0/',
)
# Read from its own environment variable (previously this mistakenly read
# MAILBOX_INGESTION_GRAPH_URL, so the mailbox address could not be configured).
MAILBOX_INGESTION_EMAIL = env(
    'MAILBOX_INGESTION_EMAIL',
    default='',
)
540560

541561
# GOV.UK PAY
542562
GOVUK_PAY_URL = env('GOVUK_PAY_URL', default='')
@@ -641,12 +661,6 @@ def _add_hawk_credentials(id_env_name, key_env_name, scopes):
641661
'aws_secret_access_key': env('REPORT_AWS_SECRET_ACCESS_KEY', default=''),
642662
'aws_region': env('REPORT_AWS_REGION', default=''),
643663
},
644-
'mailbox': {
645-
'bucket': env('MAILBOX_BUCKET', default=''),
646-
'aws_access_key_id': env('MAILBOX_AWS_ACCESS_KEY_ID', default=''),
647-
'aws_secret_access_key': env('MAILBOX_AWS_SECRET_ACCESS_KEY', default=''),
648-
'aws_region': env('MAILBOX_AWS_REGION', default=''),
649-
},
650664
}
651665

652666
DIT_EMAIL_INGEST_BLOCKLIST = [

datahub/email_ingestion/emails.py

+78-44
Original file line numberDiff line numberDiff line change
@@ -1,90 +1,124 @@
1-
import tempfile
21
from logging import getLogger
32

43
import mailparser
4+
import requests
55
from django.conf import settings
6-
from django.core.exceptions import ImproperlyConfigured
76
from django.utils.timezone import now
7+
from rest_framework import status
88

9-
from datahub.documents import utils as documents
109
from datahub.email_ingestion.models import MailboxLogging, MailboxProcessingStatus
11-
from datahub.interaction.email_processors.processors import CalendarInteractionEmailProcessor
10+
from datahub.interaction.email_processors.processors import InteractionPlainEmailProcessor
11+
1212

1313
logger = getLogger(__name__)
1414

1515
BUCKET_ID = 'mailbox'
1616

1717

18-
def get_mail_docs_in_bucket():
19-
"""
20-
Gets all mail documents in the bucket.
21-
"""
22-
if BUCKET_ID not in settings.DOCUMENT_BUCKETS:
23-
raise ImproperlyConfigured(f'Bucket "{BUCKET_ID}" is missing in settings')
18+
def _get_headers(token):
19+
return {
20+
'Authorization': f'Bearer {token}',
21+
}
2422

25-
config = settings.DOCUMENT_BUCKETS[BUCKET_ID]
26-
if 'bucket' not in config:
27-
raise ImproperlyConfigured(f'Bucket "{BUCKET_ID}" not configured properly in settings')
2823

29-
name = config['bucket']
30-
if not name:
31-
raise ImproperlyConfigured(
32-
f'Bucket "{BUCKET_ID}" bucket value not configured properly in settings',
33-
)
24+
def _get_base_url():
    """
    Return the Graph API URL prefix for the ingestion mailbox user.

    The result is the configured Graph URL followed by ``users/<email>``;
    no separator is inserted, so the configured URL is used exactly as given.
    """
    mailbox_address = settings.MAILBOX_INGESTION_EMAIL
    graph_root = settings.MAILBOX_INGESTION_GRAPH_URL
    return f'{graph_root}users/{mailbox_address}'
27+
28+
29+
def get_access_token(tenant_id, client_id, client_secret):
    """
    Request an OAuth access token using the client-credentials grant.

    :param tenant_id: directory (tenant) ID used to build the token endpoint URL
    :param client_id: OAuth client ID sent in the token request
    :param client_secret: OAuth client secret sent in the token request
    :returns: the ``access_token`` value from the JSON response, or ``None``
        when the response body does not contain one (the HTTP status code is
        not inspected here)
    """
    # The Microsoft identity platform v2.0 token endpoint lives under
    # "/oauth2/v2.0/token" (note "oauth2", not "oauth").
    token_url = f'https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token'
    token_data = {
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret,
        'scope': 'https://graph.microsoft.com/.default',
    }
    token_request = requests.post(token_url, data=token_data)
    return token_request.json().get('access_token')
39+
40+
41+
def read_messages(token):
    """
    Return the messages listed in the mailbox Inbox folder.

    A single GET request is issued; the ``value`` list of the JSON response is
    returned, or an empty list when that key is absent.
    """
    inbox_url = f'{_get_base_url()}/mailFolders/Inbox/messages'
    response = requests.get(inbox_url, headers=_get_headers(token))
    return response.json().get('value', [])
51+
3452

35-
client = documents.get_s3_client_for_bucket(bucket_id=BUCKET_ID)
53+
def fetch_message(token, message_id):
54+
base_url = _get_base_url()
55+
content_url = f'{base_url}/messages/{message_id}/$value'
3656

37-
paginator = client.get_paginator('list_objects')
38-
for page in paginator.paginate(Bucket=name):
39-
for doc in page.get('Contents') or []:
40-
key = doc['Key']
41-
with tempfile.TemporaryFile(mode='w+b') as f:
42-
client.download_fileobj(Bucket=name, Key=key, Fileobj=f)
43-
f.seek(0)
44-
content = f.read()
45-
yield {'source': key, 'content': content}
57+
content_request = requests.get(content_url, headers=_get_headers(token))
58+
if content_request.status_code == status.HTTP_200_OK:
59+
content = content_request.text
60+
return content
61+
62+
return None
63+
64+
65+
def delete_message(token, message_id):
    """
    Delete the given message from the mailbox Inbox folder.

    :returns: ``True`` only when the server answers with 204 No Content,
        ``False`` for any other status code
    """
    target_url = f'{_get_base_url()}/mailFolders/Inbox/messages/{message_id}'
    response = requests.delete(target_url, headers=_get_headers(token))
    return response.status_code == status.HTTP_204_NO_CONTENT
4672

4773

4874
def process_ingestion_emails():
4975
"""
5076
Gets all new messages in the mailbox and processes each message.
5177
"""
52-
processor = CalendarInteractionEmailProcessor()
78+
processor = InteractionPlainEmailProcessor()
5379

54-
for message in get_mail_docs_in_bucket():
55-
source = message['source']
56-
try:
57-
documents.delete_document(bucket_id=BUCKET_ID, document_key=message['source'])
58-
except Exception as e:
59-
logger.exception('Error deleting message: "%s", error: "%s"', source, e)
80+
token = get_access_token(
81+
settings.MAILBOX_INGESTION_TENANT_ID,
82+
settings.MAILBOX_INGESTION_CLIENT_ID,
83+
settings.MAILBOX_INGESTION_CLIENT_SECRET,
84+
)
85+
86+
for message in read_messages(token):
87+
message_id = message['id']
88+
89+
content = fetch_message(token, message_id)
90+
if not content:
91+
logger.error('Error fetching message: "%s"', message_id)
92+
continue
93+
if not delete_message(token, message_id):
94+
logger.error('Error deleting message: "%s"', message_id)
6095
continue
6196

6297
try:
63-
log = _create_log_entry(source, message)
98+
log = _create_log_entry(message_id, message, content)
6499

65-
email = mailparser.parse_from_bytes(message['content'])
100+
email = mailparser.parse_from_string(content)
66101
processed, reason, interaction_id = processor.process_email(message=email)
67102
if not processed:
68103
_update_log_status(log, MailboxProcessingStatus.FAILURE, reason, None)
69-
logger.error('Error parsing message: "%s", error: "%s"', source, reason)
104+
logger.error('Error parsing message: "%s", error: "%s"', message_id, reason)
70105
else:
71106
_update_log_status(log, MailboxProcessingStatus.PROCESSED, reason, interaction_id)
72107
logger.info(reason)
73108
except Exception as e:
74109
_update_log_status(log, MailboxProcessingStatus.FAILURE, repr(e), None)
75-
logger.exception('Error processing message: "%s", error: "%s"', source, e)
110+
logger.exception('Error processing message: "%s", error: "%s"', message_id, e)
76111

77112
logger.info(
78-
'Successfully processed message "%s" and deleted document from bucket "%s"',
79-
source,
80-
BUCKET_ID,
113+
'Successfully processed message "%s" and deleted it from mailbox.',
114+
message_id,
81115
)
82116

83117

84-
def _create_log_entry(source, message):
118+
def _create_log_entry(source, message, content):
85119
log = MailboxLogging(
86120
retrieved_on=now(),
87-
content=message['content'].decode('utf-8'),
121+
content=content,
88122
source=source,
89123
status=MailboxProcessingStatus.RETRIEVED,
90124
)

0 commit comments

Comments
 (0)