From f3bb0b55ef7743e22406ce3512eecc236209d37c Mon Sep 17 00:00:00 2001
From: Adam Sampson
Date: Sat, 22 Dec 2007 01:54:01 +0000
Subject: [PATCH] Add support for matching articles by GUID during an update.

---
 config              |  6 ++++++
 rawdoglib/rawdog.py | 39 +++++++++++++++++++++++++++++++--------
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/config b/config
index ef07f30..6a106f6 100644
--- a/config
+++ b/config
@@ -188,6 +188,12 @@ tidyhtml false
 # sorted by the time that rawdog first saw them.
 sortbyfeeddate false
 
+# Whether to consider articles' unique IDs or GUIDs when updating rawdog's
+# database. If you turn this off, then rawdog will create a new article in its
+# database when it sees an updated version of an existing article in a feed.
+# You probably want this turned on.
+useids true
+
 # The fields to use when detecting duplicate articles: "id" is the article's
 # unique ID or GUID; "link" is the article's link. rawdog will find the first
 # one of these that's present in the article, and ignore the article if it's
diff --git a/rawdoglib/rawdog.py b/rawdoglib/rawdog.py
index 59c6725..dbdeae1 100644
--- a/rawdoglib/rawdog.py
+++ b/rawdoglib/rawdog.py
@@ -1,5 +1,5 @@
 # rawdog: RSS aggregator without delusions of grandeur.
-# Copyright 2003, 2004, 2005, 2006 Adam Sampson
+# Copyright 2003, 2004, 2005, 2006, 2007 Adam Sampson
 #
 # rawdog is free software; you can redistribute and/or modify it
 # under the terms of that license as published by the Free Software
@@ -414,6 +414,14 @@ class Feed:
 		self.feed_info = p["feed"]
 		feed = self.url
 
+		article_ids = {}
+		if config["useids"]:
+			# Find IDs for existing articles.
+			for (hash, a) in articles.items():
+				id = a.entry_info.get("id")
+				if a.feed == feed and id is not None:
+					article_ids[id] = a
+
 		seen = {}
 		sequence = 0
 		for entry_info in p["entries"]:
@@ -425,9 +433,17 @@
 			seen[article.hash] = True
 			sequence += 1
 
-			if articles.has_key(article.hash):
-				articles[article.hash].update_from(article, now)
-				plugins.call_hook("article_updated", rawdog, config, article, now)
+			id = entry_info.get("id")
+			if id in article_ids:
+				existing_article = article_ids[id]
+			elif article.hash in articles:
+				existing_article = articles[article.hash]
+			else:
+				existing_article = None
+
+			if existing_article is not None:
+				existing_article.update_from(article, now)
+				plugins.call_hook("article_updated", rawdog, config, existing_article, now)
 			else:
 				articles[article.hash] = article
 				plugins.call_hook("article_added", rawdog, config, article, now)
@@ -486,12 +502,17 @@ class Article:
 		except OverflowError:
 			pass
 
-		self.hash = self.compute_hash()
+		self.hash = self.compute_initial_hash()
 
 		self.last_seen = now
 		self.added = now
 
-	def compute_hash(self):
+	def compute_initial_hash(self):
+		"""Compute an initial unique hash for an article.
+		The generated hash must be unique amongst all articles in the
+		system (i.e. it can't just be the article ID, because that
+		would collide if more than one feed included the same
+		article)."""
 		h = sha.new()
 		def add_hash(s):
 			h.update(s.encode("UTF-8"))
@@ -512,8 +533,7 @@
 
 	def update_from(self, new_article, now):
 		"""Update this article's contents from a newer article that's
-		been identified to be the same (i.e. has hashed the same, but
-		might have other changes that aren't part of the hash)."""
+		been identified to be the same."""
 		self.entry_info = new_article.entry_info
 		self.sequence = new_article.sequence
 		self.date = new_article.date
@@ -653,6 +673,7 @@ class Config:
 			"changeconfig": 0,
 			"numthreads": 0,
 			"splitstate": 0,
+			"useids": 0,
 		}
 
 	def __getitem__(self, key): return self.config[key]
@@ -771,6 +792,8 @@ class Config:
 			self["numthreads"] = int(l[1])
 		elif l[0] == "splitstate":
 			self["splitstate"] = parse_bool(l[1])
+		elif l[0] == "useids":
+			self["useids"] = parse_bool(l[1])
 		elif l[0] == "include":
 			self.load(l[1], False)
 		elif plugins.call_hook("config_option_arglines", self, l[0], l[1], arglines):
-- 
2.35.1
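
Note (not part of the patch): the behavioural change is easiest to see in the
Feed-update hunk above. When "useids" is on, an incoming entry is first matched
against a feed-scoped index of the GUIDs of already-stored articles, and only
falls back to the content hash if no GUID matches, so an edited entry with a
stable GUID updates the existing article instead of being added again. The
standalone sketch below restates that lookup order outside rawdog's classes;
the names (build_id_index, find_existing) and the plain-dict article records
are illustrative only, not rawdog APIs.

    # Sketch of the matching order introduced by this patch: GUID first,
    # content hash second, otherwise the entry is new. Stored articles are
    # modelled here as plain dicts with "feed", "id" and "hash" keys.

    def build_id_index(articles, feed_url):
        """Index one feed's stored articles by GUID, where they have one."""
        index = {}
        for article in articles.values():
            guid = article.get("id")
            if article["feed"] == feed_url and guid is not None:
                index[guid] = article
        return index

    def find_existing(articles, id_index, entry_guid, entry_hash):
        """Return the stored article an incoming entry updates, or None."""
        if entry_guid is not None and entry_guid in id_index:
            return id_index[entry_guid]    # same GUID: updated version of a known article
        return articles.get(entry_hash)    # identical content already stored, else None

    # An edited entry keeps its GUID but hashes differently; it is still
    # recognised as an update of the stored article rather than a new one.
    stored = {"h1": {"feed": "http://feed.example/rss", "id": "guid-1", "hash": "h1"}}
    index = build_id_index(stored, "http://feed.example/rss")
    assert find_existing(stored, index, "guid-1", "h2") is stored["h1"]

With "useids false" the GUID index is never built, so only the hash fallback
applies and an edited entry is stored as a new article, which is the behaviour
the new config comment warns about. The option itself is a single boolean line
in the config file ("useids true" or "useids false"), parsed with parse_bool
like the other boolean options.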