-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathautomate_url_captioner.py
59 lines (48 loc) · 2.16 KB
/
automate_url_captioner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
from transformers import AutoProcessor, BlipForConditionalGeneration
# Load the pretrained BLIP processor and captioning model once, up front;
# both objects are reused for every image found on the page.
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# URL of the page to scrape
url = "https://en.wikipedia.org/wiki/Honda"

# Download the page. The timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() stops early on an HTTP error
# instead of silently parsing an error page as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the page with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all img elements
img_elements = soup.find_all('img')
# Write one "<image URL>: <caption>" line per captioned image. The encoding is
# pinned to UTF-8 so the output does not depend on the platform default.
with open("captions.txt", "w", encoding="utf-8") as caption_file:
    # Iterate over each img element
    for img_element in img_elements:
        img_url = img_element.get('src')

        # An <img> tag without a src attribute yields None (and an empty src
        # yields ''); there is nothing to download, so skip it rather than
        # letting the substring checks below raise TypeError and abort the run.
        if not img_url:
            continue

        # Skip SVGs and anything whose URL marks it as a 1x1 image
        # (likely an icon or tracking pixel)
        if 'svg' in img_url or '1x1' in img_url:
            continue

        # Protocol-relative URLs ("//host/path") only need a scheme;
        # anything else that is not absolute http(s) is skipped.
        if img_url.startswith('//'):
            img_url = 'https:' + img_url
        elif not img_url.startswith(('http://', 'https://')):
            continue

        # Best-effort per image: a failure on one image is reported and the
        # loop moves on to the next one instead of stopping the whole run.
        try:
            # Download the image; time out instead of hanging, and treat an
            # HTTP error as a failure rather than feeding an error body to PIL.
            image_response = requests.get(img_url, timeout=30)
            image_response.raise_for_status()

            # Convert the image data to a PIL Image
            raw_image = Image.open(BytesIO(image_response.content))
            width, height = raw_image.size
            if width * height < 400:  # Skip very small images
                continue
            raw_image = raw_image.convert('RGB')

            # Process the image
            inputs = processor(raw_image, return_tensors="pt")
            # Generate a caption for the image
            out = model.generate(**inputs, max_new_tokens=50)
            # Decode the generated tokens to text
            caption = processor.decode(out[0], skip_special_tokens=True)

            # Write the caption to the file, prepended by the image URL
            caption_file.write(f"{img_url}: {caption}\n")
        except Exception as e:
            print(f"Error processing image {img_url}: {e}")