From d7da5aedc2f7d414713d9c975db51ba8b3d2d6b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20Benoist?= Date: Mon, 23 Oct 2017 14:16:00 +0200 Subject: [PATCH 01/10] Update wsj.com.txt (#351) Fix author & date --- wsj.com.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wsj.com.txt b/wsj.com.txt index 75913a0d..08d799f0 100644 --- a/wsj.com.txt +++ b/wsj.com.txt @@ -3,10 +3,10 @@ body: //div[@id='wsj-article-wrap'] # is this still used? body: //div[@id='article_story_body'] -author: //h3[@class='byline']/a +author: //meta[@name="author"]/@content # for slide show content body: //ul[@id='imageSlide']//li[@class='firstSlide']//img | (//div[@class='txt_body']//p)[1] -date: //li[@class='dateStamp']/small +date: //meta[@itemprop="dateCreated"]/@content strip_id_or_class: insetFullBracket strip_id_or_class: insettipBox From b6150980d0b4a60afd09aa6e53d572c75c8330b9 Mon Sep 17 00:00:00 2001 From: Strubbl Date: Sat, 28 Oct 2017 14:14:31 +0200 Subject: [PATCH 02/10] fix heise.de title detection (#353) Fixes the title detection of heise.de articles. This config used the default <title> tag, but this tag is polluted with the prefix string "Druckversion - " and sometimes suffixed, e.g. with " | heise online". Therefore we replace those strings with nothing. --- heise.de.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/heise.de.txt b/heise.de.txt index cc80203d..b6d36cb3 100644 --- a/heise.de.txt +++ b/heise.de.txt @@ -42,6 +42,12 @@ strip_id_or_class: ad_ # Some optimizations replace_string(<h5>): <h2> replace_string(</h5>): </h2> +replace_string(<title>Druckversion - ): <title> +replace_string( | heise online): +replace_string( | c't Magazin): +replace_string( | Telepolis): +replace_string( | heise Security): +replace_string( | heise Autos): # this line breaks the parser #replace_string():
From b320dc5f5ebcfa726ea7e7532c4b035b658c36f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20Benoist?= Date: Sat, 28 Oct 2017 22:42:35 +0200 Subject: [PATCH 03/10] Update lemonde.fr.txt (#354) Still some crazy stuff from lemonde -_- --- lemonde.fr.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lemonde.fr.txt b/lemonde.fr.txt index d5f9a9e9..d38b9b2e 100644 --- a/lemonde.fr.txt +++ b/lemonde.fr.txt @@ -17,6 +17,11 @@ body: //div[@id='articleBody'] find_string: