#!/usr/bin/env python3
"""Download the per-season legal card lists linked from pennydreadfulmagic.com.

The seasons overview page is scraped for one card-list URL per season, each
URL is fetched, and the response text is written to
``season_<number>_legal_cards.txt`` inside ``SEASON_FILES_DIRECTORY``.
"""
import re
from pathlib import Path

import requests
from bs4 import BeautifulSoup

SEASON_FILES_DIRECTORY = "/home/viciouscirce/dox/project_mathemagicians/penny_dreadful/legality/legality_data/"

SEASONS_URL = "https://pennydreadfulmagic.com/seasons/"

# Seconds to wait on connect/read before giving up; without a timeout
# ``requests`` may block forever on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

# Position of the list item holding the legal-cards link inside a season
# section.  NOTE(review): this depends on the current page layout -- confirm
# it is still the ninth <li> whenever the site changes.
LEGAL_CARDS_ITEM_INDEX = 8

_SEASON_NUMBER_PATTERN = re.compile(r"[0-9]{1,2}")


def _fetch_text(url):
    """Return the body of *url* as text, raising on HTTP error statuses.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly instead of silently saving an error page as a card list.
    response.raise_for_status()
    return response.text


def download_seasons_html():
    """Fetch the seasons overview page and return it parsed as BeautifulSoup."""
    return BeautifulSoup(_fetch_text(SEASONS_URL), "html.parser")


def is_season(tag):
    """Return True if *tag* is a ``<section>`` whose classes include ``stats``."""
    return tag.name == "section" and "stats" in tag.attrs.get("class", [])


def get_season_number(tag):
    """Return the season number found in the ``<h2>`` header of *tag*.

    The number is the first run of one or two digits in the header text and
    is returned as a string.

    Raises:
        ValueError: if the section has no ``<h2>`` or its text has no digits.
    """
    header = tag.find("h2")
    if header is None:
        raise ValueError("season section has no <h2> header")
    # get_text() also works when the header contains nested markup, where
    # ``.string`` would be None.
    header_text = header.get_text()
    match = _SEASON_NUMBER_PATTERN.search(header_text)
    if match is None:
        raise ValueError(f"no season number found in header {header_text!r}")
    return match.group(0)


def get_season_cards_url(tag):
    """Return the legal-cards URL linked from the season section *tag*.

    The link is taken from the ``<li>`` at ``LEGAL_CARDS_ITEM_INDEX``.

    Raises:
        IndexError: if the section has too few ``<li>`` elements.
    """
    list_items = tag.find_all("li")
    legal_cards_item = list_items[LEGAL_CARDS_ITEM_INDEX]
    season_cards_url = legal_cards_item.a.attrs["href"]
    # Downgrade only the scheme; a blanket replace would also rewrite any
    # later "https" inside the URL.
    # NOTE(review): the reason for forcing plain HTTP is not visible here --
    # confirm it is still required.
    return season_cards_url.replace("https://", "http://", 1)


def get_seasons(soup):
    """Return a dict mapping season number (str) to its legal-cards URL."""
    return {
        get_season_number(season_tag): get_season_cards_url(season_tag)
        for season_tag in soup.find_all(is_season)
    }


def download_season_lists(seasons):
    """Download every season's card list.

    Args:
        seasons: dict mapping season number to the URL of its card list.

    Returns:
        A new dict mapping season number to the downloaded text; *seasons*
        itself is left untouched.
    """
    return {
        season_number: _fetch_text(season_url)
        for season_number, season_url in seasons.items()
    }


def write_season_lists(responses):
    """Write each card list to its own file in ``SEASON_FILES_DIRECTORY``.

    Args:
        responses: dict mapping season number (str) to card-list text.
    """
    directory = Path(SEASON_FILES_DIRECTORY)
    for season_number, cards_list in responses.items():
        filename = directory / f"season_{season_number}_legal_cards.txt"
        # Explicit encoding so the output does not depend on the locale.
        with open(filename, "w", encoding="utf-8") as cards_file:
            cards_file.write(cards_list)


def main():
    """Scrape the season index, download every list, and save them to disk."""
    soup = download_seasons_html()
    seasons = get_seasons(soup)
    lists = download_season_lists(seasons)
    write_season_lists(lists)


if __name__ == "__main__":
    main()