From: Jan-Pascal van Best Date: Thu, 11 Feb 2016 18:48:58 +0000 (+0100) Subject: For streaming tweets, filter for search text in limited fields X-Git-Tag: v1.0~1 X-Git-Url: http://www.vanbest.org/gitweb/?a=commitdiff_plain;h=f69695e99b3985dd30405433f7bda055402132d6;p=tweet_django For streaming tweets, filter for search text in limited fields --- diff --git a/tweet/streamrunner.py b/tweet/streamrunner.py index 1177100..0b75fe3 100644 --- a/tweet/streamrunner.py +++ b/tweet/streamrunner.py @@ -1,6 +1,8 @@ # vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4 +import json import logging +import re import threading import time @@ -15,6 +17,7 @@ logger = logging.getLogger(__name__) _twitter = None _thread = None +_match_pattern = None class Streamer(TwythonStreamer): count = 0 @@ -23,6 +26,7 @@ class Streamer(TwythonStreamer): #logger.info("Stream text: "+data['text']) #logger.info("Stream tweet: {}".format(data)) tweet = Tweet.from_status(data) + tweet.conforms_to_terms = check_match(data) tweet.save() self.count += 1 if self.count%100 == 0: @@ -40,10 +44,14 @@ class Streamer(TwythonStreamer): def start_stream(terms): global _twitter global _thread + global _match_pattern logger.info("Starting Twitter stream for {}...".format(terms)) if len(terms)==0: logger.warning("No terms given for twitter stream, not starting stream...") return + + _match_pattern = re.compile("|".join(terms), flags=re.IGNORECASE) + _twitter = Streamer(settings.TWEET_OAUTH_CONSUMER_KEY, settings.TWEET_OAUTH_CONSUMER_SECRET, settings.TWEET_OAUTH_ACCESS_TOKEN, @@ -77,14 +85,101 @@ def run_stream(): time.sleep(5) start_stream(terms) - logger.info('Running threads:') - for t in threading.enumerate(): - logger.info(" {}".format(t.name)) - - def export_tweets(filename): exporter = ExcelExporter(filename) - tweets = Tweet.objects.all() + tweets = Tweet.objects.filter(conforms_to_terms=True).order_by('-pk') for tweet in tweets: exporter.add_tweet(tweet) exporter.close() + +def check_match(status): + result = False + if _match_pattern.search(status["text"]) is not None: + result = True + if result: + #logger.debug("Terms found in text") + #logger.debug(" \"{}\"".format(status["text"])) + return result + + if "entities" in status: + if "urls" in status["entities"]: + for ue in status["entities"]["urls"]: + if _match_pattern.search(ue["display_url"]) is not None: + result = True + if _match_pattern.search(ue["expanded_url"]) is not None: + result = True + + if result: +# logger.debug("Terms found in URL entities") +# for ue in status["entities"]["urls"]: +# logger.debug(" "+ue["display_url"]) +# logger.debug(" "+ue["expanded_url"]) + return result + + if "media" in status["entities"]: + for ue in status["entities"]["media"]: + if _match_pattern.search(ue["display_url"]) is not None: + result = True + if _match_pattern.search(ue["expanded_url"]) is not None: + result = True + if result: +# logger.debug("Terms found in media URLs") +# for ue in status["entities"]["media"]: +# logger.debug(" "+ue["display_url"]) +# logger.debug(" "+ue["expanded_url"]) + return result + + if "hashtags" in status["entities"]: + for ue in status["entities"]["hashtags"]: + if _match_pattern.search(ue["text"]) is not None: + result = True + if result: +# logger.debug("Terms found in hashtags") +# for ue in status["entities"]["hashtags"]: +# logger.debug(" "+ue["text"]) + return result + + if "user_mentions" in status["entities"]: + for ue in status["entities"]["user_mentions"]: + if _match_pattern.search(ue["screen_name"]) is not None: + result = True + if _match_pattern.search(ue["name"]) is not None: + result = True + if result: +# logger.debug("Terms found in user_mentions") +# for ue in status["entities"]["user_mentions"]: +# logger.debug(" "+ue["screen_name"]) +# logger.debug(" "+ue["name"]) + return result + + logger.debug("Terms NOT FOUND in tweet:") + logger.debug(" created_at: " + status["created_at"]) + logger.debug(" text: " + status["text"]) + +# if "entities" in status: +# if "urls" in status["entities"]: +# logger.debug(" Terms not found in URL entities") +# for ue in status["entities"]["urls"]: +# logger.debug(" "+ue["display_url"]) +# logger.debug(" "+ue["expanded_url"]) +# +# if "media" in status["entities"]: +# logger.debug(" Terms not found in Media entities") +# for ue in status["entities"]["media"]: +# logger.debug(" "+ue["display_url"]) +# logger.debug(" "+ue["expanded_url"]) +# +# if "hashtags" in status["entities"]: +# logger.debug(" Terms not found in Hashtag entities") +# for ue in status["entities"]["hashtags"]: +# logger.debug(" "+ue["text"]) +# +# if "user_mentions" in status["entities"]: +# logger.debug(" Terms not found in User mention entities") +# for ue in status["entities"]["user_mentions"]: +# logger.debug(" "+ue["screen_name"]) +# logger.debug(" "+ue["name"]) + +# logger.debug(json.dumps(status)) + + return False diff --git a/tweet/views.py b/tweet/views.py index 041a0bf..4b4587a 100644 --- a/tweet/views.py +++ b/tweet/views.py @@ -110,7 +110,7 @@ def download_log(request, job_id): return response def list_stream(request): - tweets = Tweet.objects.all().order_by('-pk') + tweets = Tweet.objects.filter(conforms_to_terms=True).order_by('-pk') total_tweets = tweets.count() terms = Settings.get().stream_terms