# How to use: # 1. Make a directory called `active` # 2. Define this shell function: # addfic() { mkdir active/$1 && echo $2 >active/$1/fimfiction_id.txt; } # 3. Add stories `addfic TheTitle 12345` (where 12345 is the story ID) # 4. Run this script from the directory containing `active` # # On startup, and every 3 hours, this script will scan for files named # `active/*/fimfiction_id.txt`, and check for updates on all the stories it # finds that way. In each `active/*/` directory, it will create files named # `metadata.json` and `Chapter_???.html`. These are symlinks to the latest # versions of the story metadata and the chapter HTML. Old versions are never # deleted. # # On startup, the script checks every story for updates. On 3-hour intervals, # it checks a random subset of stories, based on what's updated recently. A # story that was last updated N weeks ago will be checked with probability 1/N. # A story that updates weekly will always be checked, and you'll see new # chapters immediately. A story that updates monthly will be checked 1/4 of # the time, so with 8 checks per day, you'll normally see new chapters within a # day. # # Each time a story changes, the script produces output like this, showing what # fields have changed: # ThroughTheAurora (436202): 1 new chapter # ThroughTheAurora (436202): word count changed: 160429 -> 162551 (+2122) # ThroughTheAurora (436202): status changed: Incomplete -> Complete # ThroughTheAurora (436202): fetching chapter 53 (1388734) # Similar output appears on the first check for newly added stories. 
import datetime
import json
import os
import random
import re
import shutil
import sys
import time

from bs4 import BeautifulSoup
import lxml  # not used directly, but bs4 needs it installed
import requests

VERSION = 2
# Seconds between scan passes over `active/`.
CHECK_INTERVAL = 3 * 3600
# Politeness delays (seconds) between chapter downloads / story checks.
CHAPTER_DELAY = 1
STORY_DELAY = 10
USER_AGENT = 'curl/7.38.0, fimfic story monitor v%d' % VERSION


def fetch(url):
    """GET `url` with our UA and the mature-content cookie; raise on HTTP error."""
    r = requests.get(url, headers={
        'User-Agent': USER_AGENT,
        'Cookie': 'view_mature=true',
    })
    r.raise_for_status()
    return r


def story_url(story):
    """URL of the story overview page for numeric story id `story`."""
    return 'https://www.fimfiction.net/story/%d/' % story
    #return 'https://localhost:8001/api/story.php?story=%d' % story


def chapter_url(chapter):
    """URL of the HTML download endpoint for numeric chapter id `chapter`."""
    return 'https://www.fimfiction.net/chapters/download/%d/html' % chapter


def active_stories():
    """Scan `active/*/fimfiction_id.txt` and return {story_name: story_id}."""
    result = {}
    for story_name in os.listdir('active'):
        id_path = os.path.join('active', story_name, 'fimfiction_id.txt')
        if os.path.exists(id_path):
            with open(id_path, 'r') as f:
                story_id = int(f.read().strip())
            result[story_name] = story_id
    return result


def get_story_info(story):
    """Fetch and parse the story page, returning a metadata dict."""
    resp = fetch(story_url(story))
    soup = BeautifulSoup(resp.text, 'lxml')
    return build_metadata(story, soup)


AUTHOR_URL_REGEX = re.compile(r'/user/([0-9]+)/[^/]*')
# Dates appear on the page like "3rd Jan 2021".
DATE_REGEX = re.compile(r'([0-9]+)(?:st|nd|rd|th) ([A-Z][a-z]{2}) ([0-9]{4})')
STORY_URL_REGEX = re.compile(r'/story/([0-9]+)/.*')
CHAPTER_URL_REGEX = re.compile(r'/story/([0-9]+)/([0-9]+)/.*')
CHAPTER_DOWNLOAD_URL_REGEX = re.compile(r'/chapters/download/([0-9]+)/txt')
COMPLETED_STATUS_RE = re.compile(r'completed-status-(.*)')


def parse_date(s):
    """Parse a site date like '3rd Jan 2021' into a UTC POSIX timestamp."""
    clean_date_str = '%s %s %s' % DATE_REGEX.match(s).groups()
    dt = datetime.datetime.strptime(clean_date_str, '%d %b %Y')
    return dt.replace(tzinfo=datetime.timezone.utc).timestamp()


def build_metadata(story, soup):
    """Extract a metadata dict (author, tags, chapters, status, ...) from the
    parsed story page `soup` for story id `story`.

    The layout roughly follows the old fimfiction API metadata format, with a
    'sequels' list added on top.
    """
    m = {}
    author_link = soup.select('.user-page-header h1 a')[0]
    m['author'] = {
        'id': int(AUTHOR_URL_REGEX.match(author_link['href']).group(1)),
        'name': author_link.string,
    }
    tag_list = soup.select('.story_content_box .story-tags')[0]
    m_categories = {}
    for link in tag_list.find_all('a'):
        # Only links carrying a `tag-*` class are real story tags.
        if any(cls.startswith('tag-')
               for cls in ' '.join(link.get('class', [])).split()):
            m_categories[link.string] = True
    m['categories'] = m_categories
    chapter_list = soup.select('.chapters')[0]
    m['chapter_count'] = len(chapter_list)
    m_chapters = []
    for chapter in chapter_list.find_all('li', recursive=False):
        # Skip the "expand all chapters" pseudo-entry.
        if chapter.select('.chapter_expander'):
            continue
        m_chapter = {}
        date_str = list(chapter.select('.date')[0].strings)[1].strip()
        m_chapter['date_modified'] = int(parse_date(date_str))
        words_tag = chapter.select('.word-count-number')[0]
        m_chapter['words'] = int(words_tag.string.replace(',', ''))
        link = chapter.select('a.chapter-title')[0]
        m_chapter['title'] = link.string.strip()
        # The chapter id is only exposed via the txt-download link's URL.
        dl_link = chapter.find('a', href=CHAPTER_DOWNLOAD_URL_REGEX)
        m_chapter['id'] = int(
            CHAPTER_DOWNLOAD_URL_REGEX.match(dl_link['href']).group(1))
        m_chapters.append(m_chapter)
    m['chapters'] = m_chapters
    m['date_modified'] = (m_chapters[-1]['date_modified']
                          if len(m_chapters) > 0 else 0)
    m['description'] = ' '.join(
        str(tag) for tag in soup.select('.description-text')[0].children)
    m['id'] = story
    status = soup.find(class_=COMPLETED_STATUS_RE)
    m['status'] = status.string.strip()
    m['title'] = soup.find('meta', property='og:title')['content']
    words_tag = soup.select('.chapters-footer .word_count b')[0]
    m['words'] = int(words_tag.string.strip().replace(',', ''))
    # Not in the original API metadata format, but nice to have
    m_sequels = []
    for header in soup.select('.sidebar-header.header-stories'):
        if next(header.strings, '').strip() != 'Sequels':
            continue
        # Walk forward in document order from the header to the story list.
        stories = header
        while stories is not None:
            if (not isinstance(stories, str)
                    and stories.get('class') == ['story-card-list']):
                break
            stories = stories.next_element
        if stories is None:
            print('warning: saw Sequels header, but no story list')
            continue
        for link in stories.select('.story_link'):
            m_sequel = {}
            m_sequel['id'] = int(STORY_URL_REGEX.match(link['href']).group(1))
            m_sequel['title'] = link.string.strip()
            m_sequels.append(m_sequel)
    m['sequels'] = m_sequels
    return m


def get_old_story_info(story_name):
    """Load the previously saved metadata for `story_name`, or {} if none."""
    path = os.path.join('active', story_name, 'metadata.json')
    if not os.path.exists(path):
        return {}
    with open(path, 'r') as f:
        return json.load(f)


def save_story_info(story_name, info, changed):
    """Write `info` to metadata.json; if `changed`, also keep a timestamped copy."""
    path = os.path.join('active', story_name, 'metadata.json')
    with open(path, 'w') as f:
        json.dump(info, f)
    if changed:
        now = int(time.mktime(datetime.datetime.now().timetuple()))
        ts_path = os.path.join('active', story_name,
                               'metadata_%010d.json' % now)
        with open(ts_path, 'w') as f:
            json.dump(info, f)


def fetch_chapter(story_name, chapter_number, chapter_info):
    """Download one chapter to a versioned file and point the
    `Chapter_NNN.html` symlink at it.  No-op if this exact version
    (id/date/words) was already downloaded."""
    url = chapter_url(chapter_info['id'])
    path = os.path.join('active', story_name,
                        'Chapter_%07d_%010d_%d.html' % (
                            chapter_info['id'],
                            chapter_info['date_modified'],
                            chapter_info['words']))
    if os.path.exists(path):
        return
    resp = fetch(url)
    # Chapter text is Unicode; write UTF-8 explicitly so we don't depend on
    # the locale's default encoding (which may not be able to encode it).
    with open(path, 'w', encoding='utf-8') as f:
        f.write(resp.text)
    link_path = os.path.join('active', story_name,
                             'Chapter_%03d.html' % chapter_number)
    try:
        os.unlink(link_path)
    except OSError:
        pass  # link didn't exist yet
    os.symlink(os.path.basename(path), link_path)


def first_changed(old, new):
    """Return the name of the first interesting metadata field that differs
    between `old` and `new`, or None.  Noisy counters (views, likes, ...) and
    chapter data are ignored."""
    for k in ('title', 'description', 'status'):
        if new.get(k) != old.get(k):
            return k
    for k in new:
        if k in ('chapters', 'chapter_count', 'comments', 'likes', 'dislikes',
                 'views', 'total_views', 'date_modified'):
            continue
        if new.get(k) != old.get(k):
            return k
    return None


def updated_chapters(old, new):
    """Return the set of 0-based indices of chapters that are new or whose
    id/date_modified changed between `old` and `new` metadata."""
    old_chapters = old.get('chapters', [])
    new_chapters = new.get('chapters', [])
    result = set()
    for i in range(len(new_chapters)):
        if i >= len(old_chapters):
            result.add(i)
            continue
        old_chap = old_chapters[i]
        new_chap = new_chapters[i]
        if (old_chap['id'] != new_chap['id']
                or old_chap['date_modified'] != new_chap['date_modified']):
            result.add(i)
    return result


def show(x):
    """Render `x` as a single-line string, truncated to 50 chars for logging."""
    s = str(x)
    s = s.replace('\n', ' ')
    s = s.replace('\r', ' ')
    s = s.replace('\t', ' ')
    if len(s) > 50:
        s = s[:50] + '...'
    return s


def check_story(story, name, force=False):
    """Check one story for updates, downloading any changed chapters.

    Unless `force` is set, a story last modified N weeks ago is only checked
    with probability 1/N (see the header comment).  Returns True if network
    requests were made (so the caller should pause before the next story).
    """
    old = get_old_story_info(name)
    if 'date_modified' in old:
        age = time.mktime(datetime.datetime.now().timetuple()) - old['date_modified']
        age = age // (86400 * 7)
        # randint requires ints (floats are a TypeError on Python 3.12+), and
        # clamp at 0 in case date_modified is slightly in the future.
        if not force and random.randint(0, int(max(age, 0))) != 0:
            return False
    new = get_story_info(story)
    change = first_changed(old, new)
    chapters = updated_chapters(old, new)
    old_chapter_count = len(old.get('chapters', []))
    new_chapter_count = len(new.get('chapters', []))
    diff_chapter_count = new_chapter_count - old_chapter_count
    if diff_chapter_count != 0:
        print('%s (%d): %d new chapter%s' % (name, story, diff_chapter_count,
              's' if diff_chapter_count != 1 else ''))
    old_word_count = old.get('words', 0)
    new_word_count = new.get('words', 0)
    diff_word_count = new_word_count - old_word_count
    if diff_word_count != 0:
        print('%s (%d): word count changed: %d -> %d (%+d)' % (name, story,
              old_word_count, new_word_count, diff_word_count))
    old_status = old.get('status')
    new_status = new.get('status')
    if old_status != new_status:
        print('%s (%d): status changed: %s -> %s' % (name, story,
              old_status, new_status))
    old_sequel_count = len(old.get('sequels', ()))
    new_sequel_count = len(new.get('sequels', ()))
    if old_sequel_count != new_sequel_count:
        print('%s (%d): sequel count changed: %s -> %s' % (name, story,
              old_sequel_count, new_sequel_count))
    #if len(chapters) > 0:
    #    print('%s (%d): updated %d chapter%s' % (name, story, len(chapters),
    #          's' if len(chapters) != 1 else ''))
    #if change is not None:
    #    print('%s (%d): %s changed: %s -> %s' % (name, story, change,
    #          show(old.get(change)), show(new.get(change))))
    for i in sorted(chapters):
        info = new['chapters'][i]
        print('%s (%d): fetching chapter %d (%d)' % (name, story, i + 1,
              info['id']), flush=True)
        fetch_chapter(name, i + 1, info)
        time.sleep(CHAPTER_DELAY)
    save_story_info(name, new, new != old)
    return True


def check_stories(stories, force=False):
    """Check every story in {name: id} dict `stories`, with a progress line
    and a politeness delay after each story that hit the network.  Per-story
    scrape/parse failures are reported and skipped, not fatal."""
    i = 1
    count = len(stories)
    for name, story in sorted(stories.items()):
        sys.stdout.write('\r\x1b[2K[%2d/%2d] %s\r' % (i, count, name))
        sys.stdout.flush()
        i += 1
        try:
            wait = check_story(story, name, force)
        except (requests.exceptions.HTTPError, ValueError, KeyError,
                IndexError) as e:
            print('%s: %s' % (name, e))
            wait = True
        if wait:
            time.sleep(STORY_DELAY)


def main():
    """Loop forever: check all stories on startup, then a random subset every
    CHECK_INTERVAL seconds."""
    first = True
    while True:
        start = time.time()
        stories = active_stories()
        print('%s: checking %d stories...' % (
            datetime.datetime.now().isoformat(), len(stories)))
        check_stories(stories, force=first)
        print('\r\x1b[2K%s: done checking %d stories\n' % (
            datetime.datetime.now().isoformat(), len(stories)))
        sys.stdout.flush()
        end = time.time()
        # Clamp at 0: if a pass took longer than CHECK_INTERVAL, a negative
        # argument to time.sleep() would raise ValueError.
        time.sleep(max(0.0, CHECK_INTERVAL - (end - start)))
        first = False


if __name__ == '__main__':
    main()