Comic Scrape

So last night I decided I wanted to archive some web comics I used to read religiously, in a format I could later reshape into something easy to read, so I can catch up on a few years of missed material without clicking Next a ton of times. Thus was born cscrape!

#!/usr/bin/python
import os
import requests
from bs4 import BeautifulSoup
import xml.etree.cElementTree as ET

# write one small XML file describing a single strip, named after its publish date
def make_xml_file(comic_image, comic_date, comic_description):
    root = ET.Element("root")
    doc = ET.SubElement(root, "comic")

    ET.SubElement(doc, "image").text = comic_image
    ET.SubElement(doc, "date").text = comic_date
    ET.SubElement(doc, "description").text = comic_description

    tree = ET.ElementTree(root)
    filename = 'comics/'+comic_date+".xml"
    tree.write(filename)

start_url = 'url_goes_here'

# make sure the output directory exists before writing anything into it
if not os.path.isdir('comics'):
    os.makedirs('comics')

url = start_url

while True:
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')

    # pull the comic's metadata out of the page's Open Graph meta tags
    for link in soup.find_all('meta'):
        if link.get('property') == 'og:title':
            comic_title = link.get('content')
        if link.get('property') == 'og:image':
            comic_image = link.get('content')
        if link.get('property') == 'og:description':
            comic_description = link.get('content')
        if link.get('property') == 'og:site_name':
            comic_name = link.get('content')
        if link.get('property') == 'og:article:published_time':
            comic_date = link.get('content')
    # the last path segment of the image URL is the filename we save under
    filename_split = comic_image.split('/')
    comic_filename = filename_split[-1]
    make_xml_file(comic_filename, comic_date, comic_description)
    # fetch the strip itself and write the image bytes (not the page HTML) to disk
    r_img = requests.get(comic_image)
    with open('comics/'+comic_filename, "wb") as fh:
        fh.write(r_img.content)

    # follow the rel="next" link if there is one, otherwise we're done
    next_link = soup.find(rel='next')
    if next_link:
        url = next_link.get('href')
        print(url)
    else:
        break
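
For reference, each call to make_xml_file produces a tiny XML file along these lines; the values below are made up for illustration, and ElementTree actually writes it all on one line without the indentation:

<root>
    <comic>
        <image>example-strip.png</image>
        <date>2016-01-01</date>
        <description>Whatever the og:description tag contained.</description>
    </comic>
</root>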

It spits out the images and an accompanying XML file in the comics/ directory, so that later I can write something else to process them into an easy-to-digest format for viewing on a tablet or phone while kicked back in my recliner. Unfortunately, for right now it's fairly specific to a certain comic, but I hope to make the code a bit more flexible in the future so it can work with any comic, and perhaps roll in the functionality to process the scraped data into easy-to-read formats.
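
As a rough sketch of what that later processing step could look like, here's a minimal follow-up script (just a sketch, with its assumptions flagged in the comments) that reads the comics/ directory back and stitches everything into one long HTML page:

#!/usr/bin/python
# Sketch of a follow-up tool: glue the scraped strips into one scrollable HTML page.
# Assumes the <root><comic>... layout written by make_xml_file above, and that the
# output file comics/index.html sits in the same directory as the downloaded images.
import glob
import xml.etree.cElementTree as ET

sections = []
# filenames are date-based, so sorting them keeps the strips in publication order
for xml_file in sorted(glob.glob('comics/*.xml')):
    comic = ET.parse(xml_file).getroot().find('comic')
    image = comic.find('image').text
    date = comic.find('date').text
    description = comic.find('description').text or ''
    sections.append('<h2>%s</h2><img src="%s"><p>%s</p>' % (date, image, description))

with open('comics/index.html', 'w') as fh:
    fh.write('<html><body>\n' + '\n'.join(sections) + '\n</body></html>')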
