~deimos/madcow-bot

c890bc512a5a0a8d3747763bdc3340090ddfc909 — Chris Jones 8 years ago 29b716b
wip
A contrib/django-memebot/gruntle/fix-busted-slack-links.py => contrib/django-memebot/gruntle/fix-busted-slack-links.py +137 -0
@@ 0,0 1,137 @@
#!/usr/bin/env python

import sys
import os
import re

import logging as log
log.basicConfig(level=log.ERROR, stream=sys.stderr)

from django.core.management import setup_environ
import settings

setup_environ(settings)

from memebot.models import Link, Source
from memebot.utils.browser import Browser
from memebot.scanner import get_scanners

bad_re = re.compile(r'[>]$')

browser = Browser(user_agent='firefox', timeout=20, max_read=2097152)


scanners = get_scanners(settings.SCANNERS)

def resolve_url(link):
    """Handler for a single link"""
    # fetch url we are processing
    try:
        response = browser.open(link.url, follow_meta_redirect=True)
    except:
        print 'timeout'
        return
    if not response.is_valid:
        print 'invalid response'
        return

    # run through each configured scanner until something matches
    for scanner_name, scanner in scanners:
        try:
            result = scanner.scan(response, log, browser)
            break
        except:
            pass
    else:
        print 'no handler'
        return

    log.info('MATCH on %s: %r', scanner_name, result)

    return scanner_name, result

    '''
    # store rendered results from scanners to link and publish (deferred)

    # XXX some seriously broken shit going on witih emoji combinatorials, hack to make the links flow again
    #link.title = result.title

    '''

def main():
    #sources = Source.objects.all()
    links = Link.objects.all()
    slack_links = links.filter(source__name='hugs', source__type='slack')
    lookups = {}
    fields = 'url', 'normalized', 'resolved_url'

    def add_link(link):
        for field in fields:
            lookup = lookups.setdefault(field, {})
            val = getattr(link, field)
            lookup.setdefault(val, []).append(link)

    map(add_link, slack_links)

    bad_links = slack_links.filter(url__endswith='>')
    nbad_link = bad_links.count()
    for i, link in enumerate(bad_links):
        print i + 1, '/', nbad_link
        new_url = bad_re.sub('', link.url)
        if new_url == link.url:
            print 'not actually bad for some reason'
        else:
            sibs = lookups['url'].get(new_url)
            if sibs is not None:
                print 'skipping fixup because the fixed url already exists'
            else:
                link.url = new_url
                new_normalized = Link.objects.normalize_url(new_url)
                sibs = lookups['normalized'].get(new_normalized)
                if sibs is None:
                    ok = True
                else:
                    temp = set(sibs)
                    if link in temp:
                        temp.remove(link)
                    if temp:
                        ok = False
                    else:
                        ok = True
                if not ok:
                    print 'skipping because the new normalized link already exists'
                else:
                    attr = None
                    if link.resolved_url is not None:
                        result = resolve_url(link)
                        if result is not None:
                            scanner_name, result = result
                            link.resolved_url = result.resolved_url
                            link.content_type = result.content_type
                            link.content = result.content
                            safe_title = result.title
                            if safe_title is not None:
                                if not isinstance(safe_title, unicode):
                                    if not isinstance(safe_title, str):
                                        safe_title = str(safe_title)
                                    safe_title = safe_title.decode('latin1')
                                safe_title = safe_title.encode('ascii', 'ignore')
                                safe_title = safe_title.decode('ascii')
                                safe_title = safe_title.strip()
                                if not safe_title:
                                    safe_title = None
                            link.title = safe_title
                            link.scanner = scanner_name
                            attr = result.attr
                    link.publish(commit=False)
                    link.save()
                    if attr is not None:
                        for key, val in result.attr.iteritems():
                            link.attr[key] = val
                    add_link(link)
                    print 'fixed', link

    return 0

if __name__ == '__main__':
    sys.exit(main())

M contrib/django-memebot/gruntle/memebot/models.py => contrib/django-memebot/gruntle/memebot/models.py +3 -2
@@ 53,7 53,8 @@ class Source(Model):

    # valid locations
    SOURCE_TYPES = [('irc', 'IRC'),
                    ('web', 'Web')]
                    ('web', 'Web'),
                    ('slack', 'SlackAPI')]

    # fields
    type = models.CharField(null=False, blank=False, max_length=16, choices=SOURCE_TYPES)


@@ 64,7 65,7 @@ class Source(Model):
        unique_together = 'type', 'name'

    def __unicode__(self):
        return u'%s (%s)' % (self.name, self.type)
        return u'%s (%s)' % (self.name, self.get_type_display())


class LinkManager(models.Manager):