@@ -0,0 +1,332 @@
< ? php
namespace MailPoetVendor\Html2Text ;
if ( ! defined ( 'ABSPATH' )) exit ;
class Html2Text
{
public static function convert ( $html , $ignore_error = \false )
{
$is_office_document = static :: isOfficeDocument ( $html );
if ( $is_office_document ) {
// remove office namespace
$html = \str_replace ( array ( " <o:p> " , " </o:p> " ), " " , $html );
}
$html = static :: fixNewlines ( $html );
if ( \mb_detect_encoding ( $html , " UTF-8 " , \true )) {
$html = \mb_convert_encoding ( $html , " HTML-ENTITIES " , " UTF-8 " );
}
$doc = static :: getDocument ( $html , $ignore_error );
$output = static :: iterateOverNode ( $doc , null , \false , $is_office_document );
// process output for whitespace/newlines
$output = static :: processWhitespaceNewlines ( $output );
return $output ;
}
static function fixNewlines ( $text )
{
// replace \r\n to \n
$text = \str_replace ( " \r \n " , " \n " , $text );
// remove \rs
$text = \str_replace ( " \r " , " \n " , $text );
return $text ;
}
static function processWhitespaceNewlines ( $text )
{
// remove excess spaces around tabs
$text = \preg_replace ( " / * \t */im " , " \t " , $text );
// remove leading whitespace
$text = \ltrim ( $text );
// remove leading spaces on each line
$text = \preg_replace ( " / \n [ \t ]*/im " , " \n " , $text );
// convert non-breaking spaces to regular spaces to prevent output issues,
// do it here so they do NOT get removed with other leading spaces, as they
// are sometimes used for indentation
$text = \str_replace ( " " , " " , $text );
// remove trailing whitespace
$text = \rtrim ( $text );
// remove trailing spaces on each line
$text = \preg_replace ( " /[ \t ]* \n /im " , " \n " , $text );
// unarmor pre blocks
$text = static :: fixNewLines ( $text );
// remove unnecessary empty lines
$text = \preg_replace ( " / \n \n \n */im " , " \n \n " , $text );
return $text ;
}
static function getDocument ( $html , $ignore_error = \false )
{
$doc = new \DOMDocument ();
$html = \trim ( $html );
if ( ! $html ) {
// DOMDocument doesn't support empty value and throws an error
// Return empty document instead
return $doc ;
}
if ( $html [ 0 ] !== '<' ) {
// If HTML does not begin with a tag, we put a body tag around it.
// If we do not do this, PHP will insert a paragraph tag around
// the first block of text for some reason which can mess up
// the newlines. See pre.html test for an example.
$html = '<body>' . $html . '</body>' ;
}
if ( $ignore_error ) {
$doc -> strictErrorChecking = \false ;
$doc -> recover = \true ;
$doc -> xmlStandalone = \true ;
$old_internal_errors = \libxml_use_internal_errors ( \true );
$load_result = $doc -> loadHTML ( $html , \LIBXML_NOWARNING | \LIBXML_NOERROR | \LIBXML_NONET | \LIBXML_PARSEHUGE );
\libxml_use_internal_errors ( $old_internal_errors );
} else {
$load_result = $doc -> loadHTML ( $html );
}
if ( ! $load_result ) {
throw new Html2TextException ( " Could not load HTML - badly formed? " , $html );
}
return $doc ;
}
static function isOfficeDocument ( $html )
{
return \strpos ( $html , " urn:schemas-microsoft-com:office " ) !== \false ;
}
static function isWhitespace ( $text )
{
return \strlen ( \trim ( $text , " \n \r \t " )) === 0 ;
}
static function nextChildName ( $node )
{
// get the next child
$nextNode = $node -> nextSibling ;
while ( $nextNode != null ) {
if ( $nextNode instanceof \DOMText ) {
if ( ! static :: isWhitespace ( $nextNode -> wholeText )) {
break ;
}
}
if ( $nextNode instanceof \DOMElement ) {
break ;
}
$nextNode = $nextNode -> nextSibling ;
}
$nextName = null ;
if (( $nextNode instanceof \DOMElement || $nextNode instanceof \DOMText ) && $nextNode != null ) {
$nextName = \strtolower ( $nextNode -> nodeName );
}
return $nextName ;
}
static function iterateOverNode ( $node , $prevName = null , $in_pre = \false , $is_office_document = \false )
{
if ( $node instanceof \DOMText ) {
// Replace whitespace characters with a space (equivilant to \s)
if ( $in_pre ) {
$text = " \n " . \trim ( $node -> wholeText , " \n \r \t " ) . " \n " ;
// Remove trailing whitespace only
$text = \preg_replace ( " /[ \t ]* \n /im " , " \n " , $text );
// armor newlines with \r.
return \str_replace ( " \n " , " \r " , $text );
} else {
$text = \preg_replace ( " /[ \\ t \\ n \\ f \\ r ]+/im " , " " , $node -> wholeText );
if ( ! static :: isWhitespace ( $text ) && ( $prevName == 'p' || $prevName == 'div' )) {
return " \n " . $text ;
}
return $text ;
}
}
if ( $node instanceof \DOMDocumentType ) {
// ignore
return " " ;
}
if ( $node instanceof \DOMProcessingInstruction ) {
// ignore
return " " ;
}
$name = \strtolower ( $node -> nodeName );
$nextName = static :: nextChildName ( $node );
// start whitespace
switch ( $name ) {
case " hr " :
$prefix = '' ;
if ( $prevName != null ) {
$prefix = " \n " ;
}
return $prefix . " --------------------------------------------------------------- \n " ;
case " style " :
case " head " :
case " title " :
case " meta " :
case " script " :
// ignore these tags
return " " ;
case " h1 " :
case " h2 " :
case " h3 " :
case " h4 " :
case " h5 " :
case " h6 " :
case " ol " :
case " ul " :
case " pre " :
// add two newlines
$output = " \n \n " ;
break ;
case " td " :
case " th " :
// add tab char to separate table fields
$output = " \t " ;
break ;
case " p " :
// Microsoft exchange emails often include HTML which, when passed through
// html2text, results in lots of double line returns everywhere.
//
// To fix this, for any p element with a className of `MsoNormal` (the standard
// classname in any Microsoft export or outlook for a paragraph that behaves
// like a line return) we skip the first line returns and set the name to br.
if ( $is_office_document && $node -> getAttribute ( 'class' ) == 'MsoNormal' ) {
$output = " " ;
$name = 'br' ;
break ;
}
// add two lines
$output = " \n \n " ;
break ;
case " tr " :
// add one line
$output = " \n " ;
break ;
case " div " :
$output = " " ;
if ( $prevName !== null ) {
// add one line
$output .= " \n " ;
}
break ;
case " li " :
$output = " - " ;
break ;
default :
// print out contents of unknown tags
$output = " " ;
break ;
}
// debug
//$output .= "[$name,$nextName]";
if ( isset ( $node -> childNodes )) {
$n = $node -> childNodes -> item ( 0 );
$previousSiblingNames = array ();
$previousSiblingName = null ;
$parts = array ();
$trailing_whitespace = 0 ;
while ( $n != null ) {
$text = static :: iterateOverNode ( $n , $previousSiblingName , $in_pre || $name == 'pre' , $is_office_document );
// Pass current node name to next child, as previousSibling does not appear to get populated
if ( $n instanceof \DOMDocumentType || $n instanceof \DOMProcessingInstruction || $n instanceof \DOMText && static :: isWhitespace ( $text )) {
// Keep current previousSiblingName, these are invisible
$trailing_whitespace ++ ;
} else {
$previousSiblingName = \strtolower ( $n -> nodeName );
$previousSiblingNames [] = $previousSiblingName ;
$trailing_whitespace = 0 ;
}
$node -> removeChild ( $n );
$n = $node -> childNodes -> item ( 0 );
$parts [] = $text ;
}
// Remove trailing whitespace, important for the br check below
while ( $trailing_whitespace -- > 0 ) {
\array_pop ( $parts );
}
// suppress last br tag inside a node list if follows text
$last_name = \array_pop ( $previousSiblingNames );
if ( $last_name === 'br' ) {
$last_name = \array_pop ( $previousSiblingNames );
if ( $last_name === '#text' ) {
\array_pop ( $parts );
}
}
$output .= \implode ( '' , $parts );
}
// end whitespace
switch ( $name ) {
case " h1 " :
case " h2 " :
case " h3 " :
case " h4 " :
case " h5 " :
case " h6 " :
case " pre " :
case " p " :
// add two lines
$output .= " \n \n " ;
break ;
case " br " :
// add one line
$output .= " \n " ;
break ;
case " div " :
break ;
case " a " :
// links are returned in [text](link) format
$href = $node -> getAttribute ( " href " );
$output = \trim ( $output );
// remove double [[ ]] s from linking images
if ( \substr ( $output , 0 , 1 ) == " [ " && \substr ( $output , - 1 ) == " ] " ) {
$output = \substr ( $output , 1 , \strlen ( $output ) - 2 );
// for linking images, the title of the <a> overrides the title of the <img>
if ( $node -> getAttribute ( " title " )) {
$output = $node -> getAttribute ( " title " );
}
}
// if there is no link text, but a title attr
if ( ! $output && $node -> getAttribute ( " title " )) {
$output = $node -> getAttribute ( " title " );
}
if ( $href == null ) {
// it doesn't link anywhere
if ( $node -> getAttribute ( " name " ) != null ) {
$output = " [ { $output } ] " ;
}
} else {
if ( $output ) {
$output = " [ { $output } ]( { $href } ) " ;
} else {
// empty string
$output = $href ;
}
}
// does the next node require additional whitespace?
switch ( $nextName ) {
case " h1 " :
case " h2 " :
case " h3 " :
case " h4 " :
case " h5 " :
case " h6 " :
$output .= " \n " ;
break ;
}
break ;
case " img " :
if ( $node -> getAttribute ( " title " )) {
$output = " [ " . $node -> getAttribute ( " title " ) . " ] " ;
} elseif ( $node -> getAttribute ( " alt " )) {
$output = " [ " . $node -> getAttribute ( " alt " ) . " ] " ;
} else {
$output = " " ;
}
break ;
case " li " :
$output .= " \n " ;
break ;
case " blockquote " :
// process quoted text for whitespace/newlines
$output = static :: processWhitespaceNewlines ( $output );
// add leading newline
$output = " \n " . $output ;
// prepend '> ' at the beginning of all lines
$output = \preg_replace ( " / \n /im " , " \n > " , $output );
// replace leading '> >' with '>>'
$output = \preg_replace ( " / \n > >/im " , " \n >> " , $output );
// add another leading newline and trailing newlines
$output = " \n " . $output . " \n \n " ;
break ;
default :
}
return $output ;
}
}