summaryrefslogtreecommitdiffstats
path: root/utility/articleenhancer/articleenhancer.php
blob: d7701d53bfc54e43bfc4d0e116cf774009f3ce67 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
<?php

/**
* ownCloud - News
*
* @author Alessandro Cosentino
* @author Bernhard Posselt
* @copyright 2012 Alessandro Cosentino cosenal@gmail.com
* @copyright 2012 Bernhard Posselt dev@bernhard-posselt.com
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU AFFERO GENERAL PUBLIC LICENSE
* License as published by the Free Software Foundation; either
* version 3 of the License, or any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU AFFERO GENERAL PUBLIC LICENSE for more details.
*
* You should have received a copy of the GNU Affero General Public
* License along with this library.  If not, see <http://www.gnu.org/licenses/>.
*
*/

namespace OCA\News\Utility\ArticleEnhancer;

use \OCA\News\Utility\SimplePieFileFactory;


/**
 * Base class for enhancers that replace the body of a feed item with markup
 * extracted from the page the item links to.
 *
 * An enhancer is configured with a regular expression that selects the item
 * URLs it is responsible for and with an XPath expression that selects the
 * part of the fetched page which becomes the new item body.
 */
abstract class ArticleEnhancer {


	private $articleUrlRegex;
	private $articleXPath;
	private $purifier;
	private $fileFactory;
	private $maximumTimeout;


	/**
	 * @param $purifier the purifier object to clean the html which will be
	 * matched; its purify() method is called with the extracted markup
	 * @param SimplePieFileFactory $fileFactory a factory for getting a simple
	 * pie file instance
	 * @param string $articleUrlRegex the regex to match which article should be
	 * handled
	 * @param string $articleXPath the xpath which tells the fetcher with what
	 * body the feed should be replaced
	 * @param int $maximumTimeout maximum timeout in seconds
	 */
	public function __construct($purifier, SimplePieFileFactory $fileFactory,
	                            $articleUrlRegex, $articleXPath,
	                            $maximumTimeout=10){
		$this->purifier = $purifier;
		$this->articleUrlRegex = $articleUrlRegex;
		$this->articleXPath = $articleXPath;
		$this->fileFactory = $fileFactory;
		// must be stored in the property that enhance() reads, otherwise the
		// configured timeout never reaches the file factory
		$this->maximumTimeout = $maximumTimeout;
	}


	/**
	 * Checks whether this enhancer is responsible for the given item
	 * @param $item the item to check; its getUrl() result is matched against
	 * the article url regex
	 * @return bool true if the url matches, false if it does not match or the
	 * regex could not be evaluated
	 */
	public function canHandle($item){
		// preg_match returns 1 on a match, 0 on no match and false on error
		return preg_match($this->articleUrlRegex, $item->getUrl()) === 1;
	}


	/**
	 * Fetches the page the item links to, extracts the part selected by the
	 * article xpath, purifies it and sets it as the new body of the item.
	 * If nothing can be extracted the body is set to the purified empty string
	 * @param $item the item which should be enhanced
	 * @return the same item with its body replaced
	 */
	public function enhance($item){
		$file = $this->fileFactory->getFile($item->getUrl(), $this->maximumTimeout);
		$body = $file->body;
		$dom = new \DOMDocument();

		// loadHTML rejects empty input (it throws on PHP 8), so only parse
		// when there is something to parse; the document stays empty otherwise
		if(is_string($body) && $body !== '') {
			// real world html is rarely valid: collect the parser errors
			// internally instead of emitting a warning for each of them
			$previousSetting = libxml_use_internal_errors(true);
			$dom->loadHTML($body);
			libxml_clear_errors();
			libxml_use_internal_errors($previousSetting);
		}

		$xpath = new \DOMXpath($dom);
		$xpathResult = $xpath->evaluate($this->articleXPath);

		// in case it wasnt a text query assume its a node list and turn the
		// first matched element into its html representation
		if(!is_string($xpathResult)) {
			$xpathResult = $this->domToString($xpathResult);
		}

		$sanitizedResult = $this->purifier->purify($xpathResult);
		$item->setBody($sanitizedResult);


		return $item;
	}


	/**
	 * Method which turns an xpath result to a string
	 * Assumes that the result matches a single element. If the result
	 * is not a single element, you can customize it by overwriting this
	 * method
	 * @param $xpathResult the result from the xpath query
	 * @return the html of the first matched node or an empty string if the
	 * result is not a node list or the node list is empty
	 */
	protected function domToString($xpathResult) {
		// evaluate() can also yield false, a number or a boolean, none of
		// which carries nodes that could be serialized
		if($xpathResult instanceof \DOMNodeList && $xpathResult->length > 0) {
			return $this->toInnerHTML($xpathResult->item(0));
		} else {
			return "";
		}
	}


	/**
	 * Serializes a node to html by importing a deep copy of it into a fresh
	 * document. Despite the method name the markup of the node itself is
	 * part of the result, not only the markup of its children
	 * @param $node the dom node which should be serialized
	 * @return the html of the node with surrounding whitespace removed
	 */
	protected function toInnerHTML($node) {
		$dom = new \DOMDocument();
		$dom->appendChild($dom->importNode($node, true));
		return trim($dom->saveHTML());
	}


}