# How to use: # 1. Make a directory called `active` # 2. Define this shell function: # addfic() { mkdir active/$1 && echo $2 >active/$1/fimfiction_id.txt; } # 3. Add stories `addfic TheTitle 12345` (where 12345 is the story ID) # 4. Run this script from the directory containing `active` # # On startup, and every 3 hours, this script will scan for files named # `active/*/fimfiction_id.txt`, and check for updates on all the stories it # finds that way. In each `active/*/` directory, it will create files named # `metadata.json` and `Chapter_???.html`. These are symlinks to the latest # versions of the story metadata and the chapter HTML. Old versions are never # deleted. # # On startup, the script checks every story for updates. On 3-hour intervals, # it checks a random subset of stories, based on what's updated recently. A # story that was last updated N weeks ago will be checked with probability 1/N. # A story that updates weekly will always be checked, and you'll see new # chapters immediately. A story that updates monthly will be checked 1/4 of # the time, so with 8 checks per day, you'll normally see new chapters within a # day. # # Each time a story changes, the script produces output like this, showing what # fields have changed: # ThroughTheAurora (436202): 1 new chapter # ThroughTheAurora (436202): word count changed: 160429 -> 162551 (+2122) # ThroughTheAurora (436202): status changed: Incomplete -> Complete # ThroughTheAurora (436202): fetching chapter 53 (1388734) # Similar output appears on the first check for newly added stories. 
import datetime
import json
import os
import random
import re
import shutil
import sys
import time

from bs4 import BeautifulSoup
import lxml  # not used directly, but bs4 needs it installed
import requests

VERSION = 2
# Seconds between scan passes over `active/`.
CHECK_INTERVAL = 3 * 3600
# Politeness delays (seconds) between chapter downloads / story checks.
CHAPTER_DELAY = 1
STORY_DELAY = 10
USER_AGENT = 'curl/7.38.0, fimfic story monitor v%d' % VERSION


def fetch(url):
    """GET `url` with our UA and the mature-content cookie; raise on HTTP error."""
    r = requests.get(url, headers={
        'User-Agent': USER_AGENT,
        'Cookie': 'view_mature=true',
    })
    r.raise_for_status()
    return r


def story_url(story):
    """URL of the story overview page for numeric story id `story`."""
    return 'https://www.fimfiction.net/story/%d/' % story
    #return 'https://localhost:8001/api/story.php?story=%d' % story


def chapter_url(chapter):
    """URL of the HTML download endpoint for numeric chapter id `chapter`."""
    return 'https://www.fimfiction.net/chapters/download/%d/html' % chapter


def active_stories():
    """Scan `active/*/fimfiction_id.txt` and return {story_name: story_id}."""
    result = {}
    for story_name in os.listdir('active'):
        id_path = os.path.join('active', story_name, 'fimfiction_id.txt')
        if os.path.exists(id_path):
            with open(id_path, 'r') as f:
                story_id = int(f.read().strip())
            result[story_name] = story_id
    return result


def get_story_info(story):
    """Fetch and parse the story page, returning a metadata dict."""
    resp = fetch(story_url(story))
    soup = BeautifulSoup(resp.text, 'lxml')
    return build_metadata(story, soup)


AUTHOR_URL_REGEX = re.compile(r'/user/([0-9]+)/[^/]*')
# Dates appear on the page like "3rd Jan 2021".
DATE_REGEX = re.compile(r'([0-9]+)(?:st|nd|rd|th) ([A-Z][a-z]{2}) ([0-9]{4})')
STORY_URL_REGEX = re.compile(r'/story/([0-9]+)/.*')
CHAPTER_URL_REGEX = re.compile(r'/story/([0-9]+)/([0-9]+)/.*')
CHAPTER_DOWNLOAD_URL_REGEX = re.compile(r'/chapters/download/([0-9]+)/txt')
COMPLETED_STATUS_RE = re.compile(r'completed-status-(.*)')


def parse_date(s):
    """Parse a site date like '3rd Jan 2021' into a UTC POSIX timestamp."""
    clean_date_str = '%s %s %s' % DATE_REGEX.match(s).groups()
    dt = datetime.datetime.strptime(clean_date_str, '%d %b %Y')
    return dt.replace(tzinfo=datetime.timezone.utc).timestamp()


def build_metadata(story, soup):
    """Extract a metadata dict (author, tags, chapters, status, ...) from the
    parsed story page `soup` for story id `story`.

    The layout roughly follows the old fimfiction API metadata format, with a
    'sequels' list added on top.
    """
    m = {}
    author_link = soup.select('.user-page-header h1 a')[0]
    m['author'] = {
        'id': int(AUTHOR_URL_REGEX.match(author_link['href']).group(1)),
        'name': author_link.string,
    }
    tag_list = soup.select('.story_content_box .story-tags')[0]
    m_categories = {}
    for link in tag_list.find_all('a'):
        # Only links carrying a `tag-*` class are real story tags.
        if any(cls.startswith('tag-')
               for cls in ' '.join(link.get('class', [])).split()):
            m_categories[link.string] = True
    m['categories'] = m_categories
    chapter_list = soup.select('.chapters')[0]
    m['chapter_count'] = len(chapter_list)
    m_chapters = []
    for chapter in chapter_list.find_all('li', recursive=False):
        # Skip the "expand all chapters" pseudo-entry.
        if chapter.select('.chapter_expander'):
            continue
        m_chapter = {}
        date_str = list(chapter.select('.date')[0].strings)[1].strip()
        m_chapter['date_modified'] = int(parse_date(date_str))
        words_tag = chapter.select('.word-count-number')[0]
        m_chapter['words'] = int(words_tag.string.replace(',', ''))
        link = chapter.select('a.chapter-title')[0]
        m_chapter['title'] = link.string.strip()
        # The chapter id is only exposed via the txt-download link's URL.
        dl_link = chapter.find('a', href=CHAPTER_DOWNLOAD_URL_REGEX)
        m_chapter['id'] = int(
            CHAPTER_DOWNLOAD_URL_REGEX.match(dl_link['href']).group(1))
        m_chapters.append(m_chapter)
    m['chapters'] = m_chapters
    m['date_modified'] = (m_chapters[-1]['date_modified']
                          if len(m_chapters) > 0 else 0)
    m['description'] = ' '.join(
        str(tag) for tag in soup.select('.description-text')[0].children)
    m['id'] = story
    status = soup.find(class_=COMPLETED_STATUS_RE)
    m['status'] = status.string.strip()
    m['title'] = soup.find('meta', property='og:title')['content']
    words_tag = soup.select('.chapters-footer .word_count b')[0]
    m['words'] = int(words_tag.string.strip().replace(',', ''))
    # Not in the original API metadata format, but nice to have
    m_sequels = []
    for header in soup.select('.sidebar-header.header-stories'):
        if next(header.strings, '').strip() != 'Sequels':
            continue
        # Walk forward in document order from the header to the story list.
        stories = header
        while stories is not None:
            if (not isinstance(stories, str)
                    and stories.get('class') == ['story-card-list']):
                break
            stories = stories.next_element
        if stories is None:
            print('warning: saw Sequels header, but no story list')
            continue
        for link in stories.select('.story_link'):
            m_sequel = {}
            m_sequel['id'] = int(STORY_URL_REGEX.match(link['href']).group(1))
            m_sequel['title'] = link.string.strip()
            m_sequels.append(m_sequel)
    m['sequels'] = m_sequels
    return m


def get_old_story_info(story_name):
    """Load the previously saved metadata for `story_name`, or {} if none."""
    path = os.path.join('active', story_name, 'metadata.json')
    if not os.path.exists(path):
        return {}
    with open(path, 'r') as f:
        return json.load(f)


def save_story_info(story_name, info, changed):
    """Write `info` to metadata.json; if `changed`, also keep a timestamped copy."""
    path = os.path.join('active', story_name, 'metadata.json')
    with open(path, 'w') as f:
        json.dump(info, f)
    if changed:
        now = int(time.mktime(datetime.datetime.now().timetuple()))
        ts_path = os.path.join('active', story_name,
                               'metadata_%010d.json' % now)
        with open(ts_path, 'w') as f:
            json.dump(info, f)


def fetch_chapter(story_name, chapter_number, chapter_info):
    """Download one chapter to a versioned file and point the
    `Chapter_NNN.html` symlink at it.  No-op if this exact version
    (id/date/words) was already downloaded."""
    url = chapter_url(chapter_info['id'])
    path = os.path.join('active', story_name,
                        'Chapter_%07d_%010d_%d.html' % (
                            chapter_info['id'],
                            chapter_info['date_modified'],
                            chapter_info['words']))
    if os.path.exists(path):
        return
    resp = fetch(url)
    # Chapter text is Unicode; write UTF-8 explicitly so we don't depend on
    # the locale's default encoding (which may not be able to encode it).
    with open(path, 'w', encoding='utf-8') as f:
        f.write(resp.text)
    link_path = os.path.join('active', story_name,
                             'Chapter_%03d.html' % chapter_number)
    try:
        os.unlink(link_path)
    except OSError:
        pass  # link didn't exist yet
    os.symlink(os.path.basename(path), link_path)


def first_changed(old, new):
    """Return the name of the first interesting metadata field that differs
    between `old` and `new`, or None.  Noisy counters (views, likes, ...) and
    chapter data are ignored."""
    for k in ('title', 'description', 'status'):
        if new.get(k) != old.get(k):
            return k
    for k in new:
        if k in ('chapters', 'chapter_count', 'comments', 'likes', 'dislikes',
                 'views', 'total_views', 'date_modified'):
            continue
        if new.get(k) != old.get(k):
            return k
    return None


def updated_chapters(old, new):
    """Return the set of 0-based indices of chapters that are new or whose
    id/date_modified changed between `old` and `new` metadata."""
    old_chapters = old.get('chapters', [])
    new_chapters = new.get('chapters', [])
    result = set()
    for i in range(len(new_chapters)):
        if i >= len(old_chapters):
            result.add(i)
            continue
        old_chap = old_chapters[i]
        new_chap = new_chapters[i]
        if (old_chap['id'] != new_chap['id']
                or old_chap['date_modified'] != new_chap['date_modified']):
            result.add(i)
    return result


def show(x):
    """Render `x` as a single-line string, truncated to 50 chars for logging."""
    s = str(x)
    s = s.replace('\n', ' ')
    s = s.replace('\r', ' ')
    s = s.replace('\t', ' ')
    if len(s) > 50:
        s = s[:50] + '...'
    return s


def check_story(story, name, force=False):
    """Check one story for updates, downloading any changed chapters.

    Unless `force` is set, a story last modified N weeks ago is only checked
    with probability 1/N (see the header comment).  Returns True if network
    requests were made (so the caller should pause before the next story).
    """
    old = get_old_story_info(name)
    if 'date_modified' in old:
        age = time.mktime(datetime.datetime.now().timetuple()) - old['date_modified']
        age = age // (86400 * 7)
        # randint requires ints (floats are a TypeError on Python 3.12+), and
        # clamp at 0 in case date_modified is slightly in the future.
        if not force and random.randint(0, int(max(age, 0))) != 0:
            return False
    new = get_story_info(story)
    change = first_changed(old, new)
    chapters = updated_chapters(old, new)
    old_chapter_count = len(old.get('chapters', []))
    new_chapter_count = len(new.get('chapters', []))
    diff_chapter_count = new_chapter_count - old_chapter_count
    if diff_chapter_count != 0:
        print('%s (%d): %d new chapter%s' % (name, story, diff_chapter_count,
              's' if diff_chapter_count != 1 else ''))
    old_word_count = old.get('words', 0)
    new_word_count = new.get('words', 0)
    diff_word_count = new_word_count - old_word_count
    if diff_word_count != 0:
        print('%s (%d): word count changed: %d -> %d (%+d)' % (name, story,
              old_word_count, new_word_count, diff_word_count))
    old_status = old.get('status')
    new_status = new.get('status')
    if old_status != new_status:
        print('%s (%d): status changed: %s -> %s' % (name, story,
              old_status, new_status))
    old_sequel_count = len(old.get('sequels', ()))
    new_sequel_count = len(new.get('sequels', ()))
    if old_sequel_count != new_sequel_count:
        print('%s (%d): sequel count changed: %s -> %s' % (name, story,
              old_sequel_count, new_sequel_count))
    #if len(chapters) > 0:
    #    print('%s (%d): updated %d chapter%s' % (name, story, len(chapters),
    #          's' if len(chapters) != 1 else ''))
    #if change is not None:
    #    print('%s (%d): %s changed: %s -> %s' % (name, story, change,
    #          show(old.get(change)), show(new.get(change))))
    for i in sorted(chapters):
        info = new['chapters'][i]
        print('%s (%d): fetching chapter %d (%d)' % (name, story, i + 1,
              info['id']), flush=True)
        fetch_chapter(name, i + 1, info)
        time.sleep(CHAPTER_DELAY)
    save_story_info(name, new, new != old)
    return True


def check_stories(stories, force=False):
    """Check every story in {name: id} dict `stories`, with a progress line
    and a politeness delay after each story that hit the network.  Per-story
    scrape/parse failures are reported and skipped, not fatal."""
    i = 1
    count = len(stories)
    for name, story in sorted(stories.items()):
        sys.stdout.write('\r\x1b[2K[%2d/%2d] %s\r' % (i, count, name))
        sys.stdout.flush()
        i += 1
        try:
            wait = check_story(story, name, force)
        except (requests.exceptions.HTTPError, ValueError, KeyError,
                IndexError) as e:
            print('%s: %s' % (name, e))
            wait = True
        if wait:
            time.sleep(STORY_DELAY)


def main():
    """Loop forever: check all stories on startup, then a random subset every
    CHECK_INTERVAL seconds."""
    first = True
    while True:
        start = time.time()
        stories = active_stories()
        print('%s: checking %d stories...' % (
            datetime.datetime.now().isoformat(), len(stories)))
        check_stories(stories, force=first)
        print('\r\x1b[2K%s: done checking %d stories\n' % (
            datetime.datetime.now().isoformat(), len(stories)))
        sys.stdout.flush()
        end = time.time()
        # Clamp at 0: if a pass took longer than CHECK_INTERVAL, a negative
        # argument to time.sleep() would raise ValueError.
        time.sleep(max(0.0, CHECK_INTERVAL - (end - start)))
        first = False


if __name__ == '__main__':
    main()