<?php
namespace RtfHtmlPhp\Html;
use RtfHtmlPhp\Document;
class HtmlFormatter
{
/** @var string Output encoding name given to the constructor ('HTML-ENTITIES' or an mbstring encoding) */
protected $encoding;
/** @var mixed Parameter of the last \deff control word seen; passed to State::reset() on \plain and \pard */
protected $defaultFont;
/** @var bool True once a \fromhtml control word with a positive parameter was seen; text is then appended without <p>/<span> wrappers */
protected $fromhtml = false;
/** @var array Map of tag name => open flag; 'p' is null until the first paragraph has been opened */
protected $openedTags = [];
/** @var string Output accumulated by the current format() run */
protected $output = '';
/** @var mixed Clone of the state that was active when the last <span> was opened, or null */
protected $previousState;
/** @var string|null Encoding name derived from \ansi, \mac, \pc, \pca or \ansicpg; fallback used by getSourceEncoding() */
protected $rtfEncoding;
/** @var mixed Formatting state currently in effect (top of the $states stack) */
protected $state;
/** @var array Stack of formatting states; processGroup() pushes a clone per group and pops it afterwards */
protected $states = [];
/**
 * Object constructor.
 *
 * By default, HtmlFormatter uses HTML-ENTITIES for code conversion.
 * You can optionally request a different encoding when creating
 * the HtmlFormatter instance.
 *
 * @param string $encoding Output encoding: 'HTML-ENTITIES' or one of the
 *                         names returned by mb_list_encodings()
 *
 * @throws \Exception When the mbstring extension is missing or the
 *                    requested encoding is not supported
 */
public function __construct($encoding = 'HTML-ENTITIES')
{
    if (!extension_loaded('mbstring')) {
        throw new \Exception("PHP mbstring extension not enabled");
    }

    // Strict comparisons on purpose: with loose ones a non-string value such
    // as true compares equal to 'HTML-ENTITIES' and skips validation entirely.
    if ($encoding !== 'HTML-ENTITIES') {
        // Check if the encoding is recognized by the mbstring extension
        if (!in_array($encoding, mb_list_encodings(), true)) {
            throw new \Exception("Unsupported encoding: $encoding");
        }
    }

    $this->encoding = $encoding;
}
/**
 * Generates HTML output for the document
 *
 * All per-document state is reset first, so one formatter instance can be
 * reused for several documents without settings of the previous document
 * (encapsulated-HTML mode, default font, source encoding) leaking into
 * the next one.
 *
 * @param Document $document The document
 *
 * @return string HTML content
 */
public function format(Document $document)
{
    // Clear current output
    $this->output = '';

    // Forget everything learned from control words of a previous document
    $this->fromhtml = false;
    $this->defaultFont = null;
    $this->rtfEncoding = null;

    // Keep track of style modifications
    $this->previousState = null;

    // Create a stack of states and put an initial standard state onto it
    $this->states = [];
    $this->state = new State();
    array_push($this->states, $this->state);

    // Keep track of opened html tags ('p' stays null until the first
    // paragraph is opened by write())
    $this->openedTags = ['span' => false, 'p' => null];

    // Begin format
    $this->processGroup($document->root);

    // Instead of removing opened tags, we close them
    $this->output .= $this->openedTags['span'] ? '</span>' : ''; // @phpstan-ignore-line
    $this->output .= $this->openedTags['p'] ? '</p>' : ''; // @phpstan-ignore-line

    // Remove extra empty paragraph at the end
    // TODO: Find the real reason it's there and fix it
    $this->output = preg_replace('|<p></p>$|', '', $this->output);

    return $this->output;
}
/**
 * Registers a font definition.
 *
 * Walks the children of a single font group: control words supply the
 * font number, family, charset, code page and pitch; a text element
 * supplies the font name. The resulting Font is registered through
 * State::setFont() under the font number (0 when no \f word is present).
 *
 * Sub-groups (e.g. alternate font names) and control symbols inside the
 * font group are ignored.
 *
 * @param \RtfHtmlPhp\Group $fontGroup A group element with a font definition
 *
 * @return void
 */
protected function loadFont(\RtfHtmlPhp\Group $fontGroup)
{
    $fontNumber = 0;
    $font = new Font();

    foreach ($fontGroup->children as $child) {
        if ($child instanceof \RtfHtmlPhp\ControlWord) {
            switch ($child->word) {
                case 'f':
                    $fontNumber = $child->parameter;
                    break;

                // Font family names
                case 'froman':
                    $font->family = "serif";
                    break;
                case 'fswiss':
                    $font->family = "sans-serif";
                    break;
                case 'fmodern':
                    $font->family = "monospace";
                    break;
                case 'fscript':
                    $font->family = "cursive";
                    break;
                case 'fdecor':
                    $font->family = "fantasy";
                    break;

                // 'fnil' (default), 'ftech' (symbol) and 'fbidi'
                // (bidirectional) carry no usable CSS family and are skipped.

                case 'fcharset': // charset
                    $font->charset = $this->getEncodingFromCharset($child->parameter);
                    break;
                case 'cpg': // code page
                    $font->codepage = $this->getEncodingFromCodepage($child->parameter);
                    break;
                case 'fprq': // Font pitch
                    $font->fprq = $child->parameter;
                    break;
            }
        }

        // Control text contains the font name, if any
        if ($child instanceof \RtfHtmlPhp\Text) {
            $name = $child->text;
            // Strip the ';' delimiter only when it is really there, so a
            // name without a delimiter does not lose its last character.
            if (substr($name, -1) === ';') {
                $name = substr($name, 0, -1);
            }
            $font->name = $name;
        }
    }

    State::setFont($fontNumber, $font);
}
/**
 * Loads every font definition found in a font table group.
 *
 * Grammar of the group:
 *   '{' \fonttbl (<fontinfo> | ('{' <fontinfo> '}'))+ '}'
 *   <fontnum><fontfamily><fcharset>?<fprq>?<panose>?
 *   <nontaggedname>?<fontemb>?<codepage>? <fontname><fontaltname>? ';'
 *
 * Only sub-groups are handed to loadFont(); every other child (such as
 * the "fonttbl" control word itself) is skipped.
 *
 * @param mixed $fontTblGrp Font table group exposing a `children` list
 *
 * @return void
 */
protected function extractFontTable($fontTblGrp)
{
    foreach ($fontTblGrp->children as $entry) {
        if ($entry instanceof \RtfHtmlPhp\Group) {
            $this->loadFont($entry);
        }
    }
}
/**
 * Builds the color table from the children of a \colortbl group.
 *
 * Example: {\colortbl;\red0\green0\blue0;}
 *
 * Every text element is treated as the ';' delimiter that ends one entry.
 * A delimiter at index 1 (directly after the \colortbl word) is the
 * 'auto' color and is stored as 0; every other entry is stored as a
 * '#rrggbb' string. Channels are picked up by control word name, so a
 * short or reordered entry never reads past the end of the list and
 * control words other than red/green/blue are not mistaken for channels.
 *
 * The result is assigned to State::$colortbl.
 *
 * @param array $colorTblGrp Children of the color table group
 *
 * @return void
 */
protected function extractColorTable($colorTblGrp)
{
    $blank = ['red' => 0, 'green' => 0, 'blue' => 0];

    $colortbl = [];
    $color = '';
    $channels = $blank;
    $hasChannels = false;
    $count = count($colorTblGrp);

    // Index 0 holds the \colortbl control word itself
    for ($i = 1; $i < $count; $i++) {
        $entry = $colorTblGrp[$i];

        if ($entry instanceof \RtfHtmlPhp\ControlWord) {
            if (isset($channels[$entry->word])) {
                // Clamp to one byte so the result is always '#rrggbb'
                $channels[$entry->word] = max(0, min(255, (int) $entry->parameter));
                $hasChannels = true;
            }
            continue;
        }

        if ($entry instanceof \RtfHtmlPhp\Text) {
            if ($hasChannels) {
                $color = sprintf(
                    '#%02x%02x%02x',
                    $channels['red'],
                    $channels['green'],
                    $channels['blue']
                );
            }

            // A delimiter right at the start denotes the 'auto' color;
            // otherwise store the most recently extracted color.
            $colortbl[] = ($i != 1) ? $color : 0;

            $channels = $blank;
            $hasChannels = false;
        }
    }

    State::$colortbl = $colortbl;
}
/**
 * Collects picture properties from the children of a \pict group and
 * appends the markup returned by Image::printImage() to the output.
 *
 * Control words select the picture format, size/scaling values and the
 * binary size; a text element supplies the image data.
 *
 * @param iterable $pictGrp Children of the picture group
 *
 * @return void
 */
protected function extractImage($pictGrp)
{
    // Picture format control words => Image::$format value
    // ("wmetafile" is intentionally not handled)
    $formats = [
        'emfblip'  => 'emf',
        'pngblip'  => 'png',
        'jpegblip' => 'jpeg',
        'macpict'  => 'pict',
    ];

    // Control words whose parameter is copied verbatim => Image property
    $properties = [
        'picw'      => 'width',
        'pich'      => 'height',
        'picwgoal'  => 'goalWidth',
        'pichgoal'  => 'goalHeight',
        'picscalex' => 'pcScaleX',
        'picscaley' => 'pcScaleY',
        'bin'       => 'binarySize',
    ];

    $image = new Image();

    foreach ($pictGrp as $item) {
        if ($item instanceof \RtfHtmlPhp\ControlWord) {
            $name = $item->word;
            if (isset($formats[$name])) {
                $image->format = $formats[$name];
            } elseif (isset($properties[$name])) {
                $image->{$properties[$name]} = $item->parameter;
            }
        } elseif ($item instanceof \RtfHtmlPhp\Text) {
            // Store the picture data
            $image->imageData = $item->text;
        }
    }

    // Output the image
    $this->output .= $image->printImage();
}
/**
 * Processes one group.
 *
 * Groups of a special type are consumed by a dedicated extractor or
 * skipped. Any other group gets its own copy of the current formatting
 * state for the duration of its children, after which the enclosing
 * state becomes current again.
 *
 * @param mixed $group Group exposing getType() and a `children` list
 *
 * @return void
 */
protected function processGroup($group)
{
    $type = $group->getType();

    if ($type == "fonttbl") {
        // Extract font table
        $this->extractFontTable($group);
        return;
    }
    if ($type == "colortbl") {
        // Extract color table
        $this->extractColorTable($group->children);
        return;
    }
    if ($type == "stylesheet" || $type == "info") {
        // Stylesheet extraction is not supported; document information is ignored
        return;
    }
    if ($type == "pict") {
        $this->extractImage($group->children);
        return;
    }
    if ($type == "nonshppict") {
        // Ignore alternative images
        return;
    }
    if ($type == "*") {
        // Process destination
        $this->processDestination($group->children);
        return;
    }

    // Enter the group: work on a copy of the current state
    $this->state = clone $this->state;
    $this->states[] = $this->state;

    foreach ($group->children as $entry) {
        $this->formatEntry($entry);
    }

    // Leave the group: drop its state and go back to the enclosing one
    array_pop($this->states);
    $this->state = $this->states[count($this->states) - 1];
}
/**
 * Processes the children of a destination group ("{\* ...}").
 *
 * Only two destinations are handled, selected by the control word at
 * index 1:
 *  - "shppict": every following entry is formatted normally;
 *  - "htmltag": text entries are appended to the output verbatim
 *    (not escaped, not wrapped), other entries are formatted normally.
 *
 * Any other destination, including a group too short to name one,
 * is ignored.
 *
 * @param array $dest Children of the destination group
 *
 * @return void
 */
protected function processDestination($dest)
{
    // A bare "{\*}" has no index 1; check before dereferencing
    if (!isset($dest[1]) || !($dest[1] instanceof \RtfHtmlPhp\ControlWord)) {
        return;
    }

    $destination = $dest[1]->word;
    if ($destination !== 'shppict' && $destination !== 'htmltag') {
        return;
    }

    $rawText = ($destination === 'htmltag');
    $count = count($dest);

    for ($i = 2; $i < $count; $i++) {
        $entry = $dest[$i];
        if ($rawText && $entry instanceof \RtfHtmlPhp\Text) {
            $this->output .= $entry->text;
        } else {
            $this->formatEntry($entry);
        }
    }
}
/**
 * Dispatches a single parsed element to the handler for its type.
 * Elements of any other type are ignored.
 *
 * @param mixed $entry Group, ControlWord, ControlSymbol or Text element
 *
 * @return void
 */
protected function formatEntry($entry)
{
    if ($entry instanceof \RtfHtmlPhp\Group) {
        $this->processGroup($entry);
        return;
    }

    if ($entry instanceof \RtfHtmlPhp\ControlWord) {
        $this->formatControlWord($entry);
        return;
    }

    if ($entry instanceof \RtfHtmlPhp\ControlSymbol) {
        $this->formatControlSymbol($entry);
        return;
    }

    if ($entry instanceof \RtfHtmlPhp\Text) {
        $this->formatText($entry);
    }
}
/**
 * Applies a single control word.
 *
 * Depending on the word this updates the formatter flags, the current
 * formatting state, the source encoding, or writes a character or a
 * paragraph break to the output. Unknown words are ignored.
 *
 * @param mixed $word Control word exposing `word` and `parameter`
 *
 * @return void
 */
protected function formatControlWord($word)
{
    // Special characters: [bytes for encapsulated HTML, reference for generated HTML].
    // In encapsulated HTML (fromhtml) the character itself is written, spelled
    // as explicit UTF-8 byte escapes so invisible characters cannot be confused
    // with a plain space. In generated HTML a numeric character reference is
    // written instead: it is pure ASCII and therefore valid in every output
    // encoding.
    $specialChars = [
        'lquote'    => ["\xE2\x80\x98", '&#8216;'], // U+2018 left single quote
        'rquote'    => ["\xE2\x80\x99", '&#8217;'], // U+2019 right single quote
        'ldblquote' => ["\xE2\x80\x9C", '&#8220;'], // U+201C left double quote
        'rdblquote' => ["\xE2\x80\x9D", '&#8221;'], // U+201D right double quote
        'bullet'    => ["\xE2\x80\xA2", '&#8226;'], // U+2022 bullet
        'endash'    => ["\xE2\x80\x93", '&#8211;'], // U+2013 en dash
        'emdash'    => ["\xE2\x80\x94", '&#8212;'], // U+2014 em dash
        'enspace'   => ["\xE2\x80\x82", '&#8194;'], // U+2002 en space
        'emspace'   => ["\xE2\x80\x83", '&#8195;'], // U+2003 em space
        // Character value 9; generated HTML gets a non-breaking space
        // because a plain space would collapse with its neighbours
        'tab'       => ["\t", '&#160;'],
    ];

    if (isset($specialChars[$word->word])) {
        $variants = $specialChars[$word->word];
        $this->write($this->fromhtml ? $variants[0] : $variants[1]);
        return;
    }

    switch ($word->word) {
        case 'fromhtml':
            $this->fromhtml = $word->parameter > 0;
            break;
        case 'htmlrtf':
            $this->state->htmlrtf = $word->parameter > 0;
            break;

        case 'plain': // Reset font formatting properties to default.
        case 'pard': // Reset to default paragraph properties.
            $this->state->reset($this->defaultFont);
            break;

        // Font formatting properties:
        case 'b': // bold
            $this->state->bold = $word->parameter;
            break;
        case 'i': // italic
            $this->state->italic = $word->parameter;
            break;
        case 'ul': // underline
            $this->state->underline = $word->parameter;
            break;
        case 'ulnone': // no underline
            $this->state->underline = false;
            break;
        case 'strike': // strike-through
            $this->state->strike = $word->parameter;
            break;
        case 'v': // hidden
            $this->state->hidden = $word->parameter;
            break;
        case 'fs': // Font size: RTF parameter scaled by 16/24, rounded up
            $this->state->fontsize = ceil(($word->parameter / 24) * 16);
            break;
        case 'f': // Font
            $this->state->font = $word->parameter;
            break;
        case 'deff': // Store default font
            $this->defaultFont = $word->parameter;
            break;

        // Colors
        case 'cf':
        case 'chcfpat':
            $this->state->fontcolor = $word->parameter;
            break;
        case 'cb':
        case 'chcbpat':
            $this->state->background = $word->parameter;
            break;
        case 'highlight':
            $this->state->hcolor = $word->parameter;
            break;

        // Line break: appended directly to the output, not through write()
        case 'line':
            $this->output .= $this->fromhtml ? "\n" : "<br/>";
            break;

        // Unicode characters
        case 'u':
            $uchar = $this->decodeUnicode($word->parameter);
            $this->write($uchar);
            break;

        // Paragraphs
        case 'par':
        case 'row':
            if ($this->fromhtml) {
                $this->output .= "\n";
                break;
            }
            // Close previously opened tags
            $this->closeTags();
            // Begin a new paragraph
            $this->openTag('p');
            break;

        // Code pages
        case 'ansi':
        case 'mac':
        case 'pc':
        case 'pca':
            $this->rtfEncoding = $this->getEncodingFromCodepage($word->word);
            break;
        case 'ansicpg':
            if ($word->parameter) {
                $this->rtfEncoding = $this->getEncodingFromCodepage($word->parameter);
            }
            break;
    }
}
/**
 * Converts a character code into its representation in the output encoding.
 *
 * With the default source encoding ('UTF-8') the code is taken as a
 * Unicode code point. With any other source encoding the code is taken
 * as a single byte in that encoding and converted with iconv(); when the
 * conversion fails the code is used as a code point instead.
 *
 * @param int         $code   Character code. Negative values are accepted:
 *                            RTF writes \uN as a signed 16-bit number, so
 *                            code points above 32767 arrive as N - 65536.
 * @param string|null $srcEnc Source encoding name for iconv; an empty
 *                            value falls back to CP1252 (RTF's ANSI default)
 *
 * @return string Numeric character reference for 'HTML-ENTITIES',
 *                otherwise the character in the output encoding
 */
protected function decodeUnicode($code, $srcEnc = 'UTF-8')
{
    // Undo the signed 16-bit representation, otherwise an invalid
    // reference such as "&#-3913;" would be produced.
    if (is_numeric($code) && $code < 0) {
        $code += 65536;
    }

    // No known source encoding: do not hand an empty name to iconv(),
    // whose behaviour would then depend on the process locale.
    if (empty($srcEnc)) {
        $srcEnc = 'CP1252';
    }

    $utf8 = false;

    if ($srcEnc != 'UTF-8') { // convert character to Unicode
        $utf8 = iconv($srcEnc, 'UTF-8', chr($code));
    }

    if ($this->encoding == 'HTML-ENTITIES') {
        return $utf8 !== false ? "&#{$this->ordUtf8($utf8)};" : "&#{$code};";
    }

    if ($this->encoding == 'UTF-8') {
        return $utf8 !== false ? $utf8 : mb_convert_encoding("&#{$code};", $this->encoding, 'HTML-ENTITIES');
    }

    return $utf8 !== false ? mb_convert_encoding($utf8, $this->encoding, 'UTF-8') :
        mb_convert_encoding("&#{$code};", $this->encoding, 'HTML-ENTITIES');
}
/**
 * Appends text to the output.
 *
 * Nothing is written while the current state has its htmlrtf flag set.
 * In encapsulated-HTML mode (fromhtml) the text is appended as is.
 * Otherwise the first paragraph is opened on demand and the text is placed
 * in a <span> carrying the current style; a new span is started whenever
 * the style differs from the one of the previous span or no span is open.
 *
 * @param string $txt Text or markup to append
 *
 * @return void
 */
protected function write($txt)
{
    // Ignore regions that are not part of the original (encapsulated) HTML content
    if ($this->state->htmlrtf) {
        return;
    }

    if (!$this->fromhtml) {
        // Create the first paragraph
        if (!isset($this->openedTags['p'])) {
            $this->openTag('p');
        }

        // A new span is needed when the style changed, or when the style is
        // unchanged but the span was closed meanwhile (e.g. end of paragraph)
        $styleChanged = !$this->state->equals($this->previousState);

        if ($styleChanged || empty($this->openedTags['span'])) {
            // If applicable close the previously opened span
            $this->closeTag('span');

            $css = $this->state->printStyle();

            // Remember the style this span was opened with
            $this->previousState = clone $this->state;

            $this->openTag('span', $css ? "style=\"{$css}\"" : "");
        }
    }

    $this->output .= $txt;
}
/**
 * Appends an opening tag to the output and marks the tag as opened.
 * Does nothing in encapsulated-HTML mode (fromhtml).
 *
 * @param string $tag  Tag name
 * @param string $attr Attribute string placed after the tag name, if any
 *
 * @return void
 */
protected function openTag($tag, $attr = '')
{
    if (!$this->fromhtml) {
        $markup = "<{$tag}";
        if ($attr) {
            $markup .= " {$attr}";
        }

        $this->output .= $markup . ">";
        $this->openedTags[$tag] = true;
    }
}
/**
 * Closes a tag if it is currently marked as opened.
 *
 * When the output ends with the attribute-less opening tag, the element
 * is empty: an empty <p> is replaced by <br>, any other empty element is
 * removed. Otherwise the closing tag is appended.
 * Does nothing in encapsulated-HTML mode (fromhtml).
 *
 * @param string $tag Tag name
 *
 * @return void
 */
protected function closeTag($tag)
{
    if ($this->fromhtml || empty($this->openedTags[$tag])) {
        return;
    }

    $opening = "<{$tag}>";
    $length = strlen($opening);

    if (substr($this->output, -$length) != $opening) {
        // Element has content (or attributes): close it normally
        $this->output .= "</{$tag}>";
    } elseif ($tag == 'p') {
        // Replace empty 'p' element with a line break
        $this->output = substr($this->output, 0, -3) . "<br>";
    } else {
        // Delete empty elements
        $this->output = substr($this->output, 0, -$length);
    }

    $this->openedTags[$tag] = false;
}
/**
 * Closes every tracked tag, in the order the tags are registered in
 * the opened-tags map. Tags that are not open are left untouched by
 * closeTag().
 *
 * @return void
 */
protected function closeTags()
{
    foreach (array_keys($this->openedTags) as $name) {
        $this->closeTag($name);
    }
}
/**
 * Writes the character represented by a control symbol.
 *
 * The characters are written as numeric character references rather than
 * as raw (and partly invisible) non-ASCII literals, so the result is
 * plain ASCII and valid in every output encoding. Unknown symbols are
 * ignored.
 *
 * @param mixed $symbol Control symbol exposing `symbol` and `parameter`
 *
 * @return void
 */
protected function formatControlSymbol($symbol)
{
    switch ($symbol->symbol) {
        case '\'':
            // Character code in the encoding of the current font or document
            $enc = $this->getSourceEncoding();
            $this->write($this->decodeUnicode($symbol->parameter, $enc));
            break;
        case '~':
            $this->write('&#160;'); // Non breaking space
            break;
        case '-':
            $this->write('&#173;'); // Optional (soft) hyphen
            break;
        case '_':
            $this->write('&#8209;'); // Non breaking hyphen
            break;
        case '{':
            $this->write('{'); // Literal opening brace
            break;
    }
}
/**
 * Escapes a text element and writes it in the output encoding.
 *
 * '&', '<' and '>' are converted to entities; quotes are left as they are.
 * Byte sequences that are not valid UTF-8 are replaced by U+FFFD instead
 * of making htmlspecialchars() return an empty string, which would drop
 * the whole text run without any notice.
 *
 * @param mixed $text Text element exposing `text`
 *
 * @return void
 */
protected function formatText($text)
{
    // Convert special characters to HTML entities
    $txt = htmlspecialchars($text->text, ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8');

    if ($this->encoding == 'HTML-ENTITIES') {
        $this->write($txt);
    } else {
        $this->write(mb_convert_encoding($txt, $this->encoding, 'UTF-8'));
    }
}
/**
 * Determines the encoding of 8-bit characters at the current position.
 *
 * When the current state has a font, the font table entry's code page is
 * preferred, then its charset. Without a usable font entry the
 * document-level encoding is returned.
 *
 * @return string|null Encoding name, or null when none is known
 */
protected function getSourceEncoding()
{
    if (!isset($this->state->font)) {
        return $this->rtfEncoding;
    }

    foreach (['codepage', 'charset'] as $property) {
        if (isset(State::$fonttbl[$this->state->font]->{$property})) {
            return State::$fonttbl[$this->state->font]->{$property};
        }
    }

    return $this->rtfEncoding;
}
/**
 * Translates an RTF \fcharset number into an iconv encoding name.
 *
 * Entries marked with '*' in the table are approximations.
 *
 * @param int $charset Charset identifier
 *
 * @return string|null Encoding name or NULL on unknown or invalid charset
 */
protected function getEncodingFromCharset($charset)
{
    // Windows character set number => iconv encoding name
    $encodings = [
        0   => 'CP1252', // ANSI: Western Europe
        1   => 'CP1252', //*Default
        2   => 'CP1252', //*Symbol
        3   => null,     // Invalid
        77  => 'MAC',    //*also [MacRoman]: Macintosh
        128 => 'CP932',  //*or [Shift_JIS]?: Japanese
        129 => 'CP949',  //*also [UHC]: Korean (Hangul)
        130 => 'CP1361', //*also [JOHAB]: Korean (Johab)
        134 => 'CP936',  //*or [GB2312]?: Simplified Chinese
        136 => 'CP950',  //*or [BIG5]?: Traditional Chinese
        161 => 'CP1253', // Greek
        162 => 'CP1254', // Turkish (latin 5)
        163 => 'CP1258', // Vietnamese
        177 => 'CP1255', // Hebrew
        178 => 'CP1256', // Simplified Arabic
        179 => 'CP1256', //*Traditional Arabic
        180 => 'CP1256', //*Arabic User
        181 => 'CP1255', //*Hebrew User
        186 => 'CP1257', // Baltic
        204 => 'CP1251', // Russian (Cyrillic)
        222 => 'CP874',  // Thai
        238 => 'CP1250', // Eastern European (latin 2)
        254 => 'CP437',  //*also [IBM437][437]: PC437
        255 => 'CP437',  //*OEM still PC437
    ];

    return isset($encodings[$charset]) ? $encodings[$charset] : null;
}
/**
 * Translates an RTF code page identifier into an iconv encoding name.
 *
 * Accepts the character-set control words ('ansi', 'mac', 'pc', 'pca')
 * as well as numeric code pages.
 *
 * @param string $cpg CodePage identifier
 *
 * @return string|null Encoding name or NULL on unknown CodePage
 */
protected function getEncodingFromCodepage($cpg)
{
    $encodings = [
        'ansi' => 'CP1252',
        'mac'  => 'MAC',
        'pc'   => 'CP437',
        'pca'  => 'CP850',
        437    => 'CP437',    // United States IBM
        708    => 'ASMO-708', // also [ISO-8859-6][ARABIC] Arabic
        // Not supported by iconv, hence absent:
        //   709 Arabic (ASMO 449+, BCON V4), 710 Arabic (transparent Arabic),
        //   711 Arabic (Nafitha Enhanced),   720 Arabic (transparent ASMO)
        819    => 'CP819',    // Windows 3.1 (US and Western Europe)
        850    => 'CP850',    // IBM multilingual
        852    => 'CP852',    // Eastern European
        860    => 'CP860',    // Portuguese
        862    => 'CP862',    // Hebrew
        863    => 'CP863',    // French Canadian
        864    => 'CP864',    // Arabic
        865    => 'CP865',    // Norwegian
        866    => 'CP866',    // Soviet Union
        874    => 'CP874',    // Thai
        932    => 'CP932',    // Japanese
        936    => 'CP936',    // Simplified Chinese
        949    => 'CP949',    // Korean
        950    => 'CP950',    // Traditional Chinese
        1250   => 'CP1250',   // Windows 3.1 (Eastern European)
        1251   => 'CP1251',   // Windows 3.1 (Cyrillic)
        1252   => 'CP1252',   // Western European
        1253   => 'CP1253',   // Greek
        1254   => 'CP1254',   // Turkish
        1255   => 'CP1255',   // Hebrew
        1256   => 'CP1256',   // Arabic
        1257   => 'CP1257',   // Baltic
        1258   => 'CP1258',   // Vietnamese
        1361   => 'CP1361',   // Johab
    ];

    return isset($encodings[$cpg]) ? $encodings[$cpg] : null;
}
/**
 * Returns the code point of the first character of a UTF-8 string.
 *
 * Besides the 1-4 byte forms, the legacy 5 and 6 byte forms are decoded
 * as well. The sequence length is verified before any continuation byte
 * is read, so truncated or otherwise malformed input yields NULL instead
 * of warnings and a meaningless number.
 *
 * @param string $chr UTF-8 encoded character
 *
 * @return int|null Code point (0 for an empty string), or NULL when the
 *                  string does not start with a well-formed sequence
 */
protected function ordUtf8($chr)
{
    $chr = (string) $chr;
    if ($chr === '') {
        return 0;
    }

    $lead = ord($chr[0]);
    if ($lead <= 127) {
        return $lead;
    }

    // Lead byte ranges: [first, last, number of continuation bytes]
    $forms = [
        [192, 223, 1],
        [224, 239, 2],
        [240, 247, 3],
        [248, 251, 4],
        [252, 253, 5],
    ];

    foreach ($forms as $form) {
        if ($lead < $form[0] || $lead > $form[1]) {
            continue;
        }

        $extra = $form[2];
        if (strlen($chr) <= $extra) {
            return null; // truncated sequence
        }

        // Payload bits of the lead byte, then 6 bits per continuation byte
        $code = $lead - $form[0];
        for ($i = 1; $i <= $extra; $i++) {
            $byte = ord($chr[$i]);
            if (($byte & 0xC0) !== 0x80) {
                return null; // not a continuation byte
            }
            $code = $code * 64 + ($byte - 128);
        }

        return $code;
    }

    // Stray continuation byte (128-191) or invalid lead byte (254, 255)
    return null;
}
}