powered by nequal
Home » Diggin_Scraper_Adapter_Htmlscraping » Timeline » 2453

Diffs

Diggin_Scraper_Adapter_Htmlscraping/trunk/tests/Diggin/Scraper/Adapter/HtmlscrapingTest.php

@@ -1,13 +1,10 @@
<?php
require_once 'PHPUnit/Framework/TestCase.php';
-
require_once 'Diggin/Scraper/Adapter/Htmlscraping.php';
-
require_once 'Zend/Http/Response.php';
/**
* Test class for Diggin_Scraper_Adapter_Htmlscraping.
- * Generated by PHPUnit on 2008-12-14 at 13:29:52.
*/
class Diggin_Scraper_Adapter_HtmlscrapingTest extends PHPUnit_Framework_TestCase
{
@@ -19,11 +16,11 @@
protected $response;
+    public static $environmentCheck;
+
/**
* Sets up the fixture, for example, opens a network connection.
* This method is called before a test is executed.
-     *
-     * @access protected
*/
protected function setUp()
{
@@ -47,35 +44,46 @@
$response_str = "$responseHeader\r\n\r\n$responseBody";
$this->response = Zend_Http_Response::fromString($response_str);
+
+        if (!self::$environmentCheck) {
+            $this->environmentCheck();
+            self::$environmentCheck = true;
+        }
}
-    /**
-     * Tears down the fixture, for example, closes a network connection.
-     * This method is called after a test is executed.
-     *
-     * @access protected
-     */
protected function tearDown()
+    {}
+
+    public function environmentCheck()
{
+        if (extension_loaded('tidy')) {
+            $this->assertInternalType('string', $this->object->getXhtml($this->response));
+        } else {
+            try {
+                $this->object->getXhtml($this->response);
+                $this->fail('IF tidy is not available, should raise Exception ');
+            } catch (Diggin_Scraper_Adapter_HtmlscrapingEnvironmentException $e) {
+                $vendor2 = dirname(dirname(dirname(__DIR__))).'/vendor2';
+                set_include_path($vendor2.PATH_SEPARATOR.get_include_path());
+                require_once 'HTMLParser.class.php';
+                $this->assertInternalType('string', $this->object->getXhtml($this->response));
+            }
+        }
}
-    /**
-     * @todo Implement testGetXmlObject().
-     */
-    public function testGetXmlObject() {
-        // Remove the following lines when you implement this test.
-        $this->markTestIncomplete(
-          'This test has not been implemented yet.'
-        );
+
+    public function testGetXmlObject()
+    {
+        $sxml = $this->object->getSimplexml($this->response);
+        $this->assertInstanceOf('Diggin_Scraper_Adapter_Wrapper_SimpleXMLElement', $sxml);
}
public function testGetXhtml()
{
-        //$this->object->getXhtml($response);
// Remove the following lines when you implement this test.
$this->markTestIncomplete(
-          'This test has not been implemented yet.'
+          'Should check valid HTML.'
);
}
@@ -108,9 +116,6 @@
$this->assertEquals('&amp;', $xh2[1]);
}
-    /**
-     *
-     */
public function testReadData() {
$this->object->setConfig(array('url' => 'http://test.org/'));

Diggin_Scraper_Adapter_Htmlscraping/trunk/library/Diggin/Scraper/Adapter/Htmlscraping.php

@@ -3,6 +3,10 @@
* This class is a remodeling of HTMLScraping
*
* @see http://www.rcdtokyo.com/etc/htmlscraping/
+ *
+ * This class requires either
+ *  the tidy extension (recommended) or
+ *  the HTMLParser class (http://www.rcdtokyo.com/ucb/contents/i000799.php)
*/
/**
@@ -56,6 +60,8 @@
* @var integer
*/
private $backup_count = 0;
+
+    private $_xhtmltransitionalDtd;
/**
* Casts a SimpleXMLElement
@@ -234,12 +240,14 @@
if ($this->config['pre_ampersand_escape']) {
$responseBody = str_replace('&', '&amp;', $responseBody);
}
-            //?
-            $responseBody = str_replace('&', '&amp;', $responseBody);
-            require_once 'HTMLParser.class.php';
+
+            // use autoload if available
+            if (!class_exists('HTMLParser')) {
+                require_once 'Diggin/Scraper/Adapter/HtmlscrapingEnvironmentException.php';
+                throw new Diggin_Scraper_Adapter_HtmlscrapingEnvironmentException('require tidy or HTMLParser class');
+            }
$parser = new HTMLParser;
-            $format_rule = require 'xhtml1-transitional_dtd.inc.php';
-            $parser->setRule($format_rule);
+            $parser->setRule($this->loadXhtmlTransitionalDtd());
$parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
$parser->setGenericParent('body');
$parser->parse($responseBody);
@@ -255,6 +263,21 @@
return "$declarations$responseBody";
}
+
+    /**
+     * Lazily loads the XHTML Transitional DTD rule set, caching it after the first load.
+     *
+     * This avoids a memory leak.
+     * @see http://amaxi.sitemix.jp/blog/archives/205
+     */
+    public function loadXhtmlTransitionalDtd()
+    {
+        if (!$this->_xhtmltransitionalDtd) {
+            $this->_xhtmltransitionalDtd = require 'xhtml1-transitional_dtd.inc.php';
+        }
+
+        return $this->_xhtmltransitionalDtd;
+    }
/**
* backup (Html and Xml comment)

Diggin_Scraper_Adapter_Htmlscraping/trunk/library/Diggin/Scraper/Adapter/HtmlscrapingEnvironmentException.php

@@ -0,0 +1,6 @@
+<?php
+require_once 'Diggin/Scraper/Adapter/Exception.php';
+
+class Diggin_Scraper_Adapter_HtmlscrapingEnvironmentException
+    extends Diggin_Scraper_Adapter_Exception
+{}