diff --git a/src/Document/Native.php b/src/Document/Native.php index b2c256f..10850d9 100644 --- a/src/Document/Native.php +++ b/src/Document/Native.php @@ -48,6 +48,8 @@ public function __construct($html) { * and not the new HTML5 form: * * with the result that parsed strings can have funny characters. + * + * We should set default charset if no charset is present. Generally it should be UTF-8 * * @see http://www.glenscott.co.uk/blog/html5-character-encodings-and-domdocument-loadhtml-and-loadhtmlfile * @see https://github.com/glenscott/dom-document-charset/blob/master/DOMDocumentCharset.php @@ -60,6 +62,15 @@ protected function _fixCharset() { $this->_html ); } + + if (stripos($this->_html, 'charset=')===false) + { + $this->_html = preg_replace( + '~(
]*>)~', + '$1_html + ); + } }