This article introduces how to convert HTML pages into Word documents and save them using PHP. It walks through the features and usage of the PHPWord tool by example, and should be a useful reference for anyone who needs it.
The example in this article describes how PHP converts HTML pages into word and saves them. Share it with everyone for your reference, the details are as follows:
The PHP tool used here is called PHPWord.
The principle of generating Word is to compress the specified xml into a zip package and change the suffix name to doc or docx.
So to use PHPWord, the zip compression extension must be enabled in your PHP environment (php_zip.dll on Windows). I wrote a demo.
Function description:
20150507 — Obtaining
tags and
require_once 'PHPWord.php';
require_once 'SimpleHtmlDom.class.php';

/**
 * Converts an HTML page into a Word 2007 (.docx) document.
 *
 * Everything is driven from the constructor: the page at $url is parsed,
 * the elements matching $Allowtag are collected (text, list items, images,
 * table cells), and the resulting document is saved and sent to the client
 * as a download.
 *
 * Runtime problems are collected in $error rather than thrown.
 */
class Word
{
    /** Source page URL. */
    private $url;

    /**
     * Parsed content. Integer keys hold one entry per leaf element
     * (array with at least "tag"); the "table" key holds the list of
     * table rows, each row being a list of cell entries.
     */
    private $LinetextArr = array();

    /** "scheme://host" of the source page, used to resolve relative image URLs. */
    public $host;

    /** Directory downloaded images are written to (CurrentDir/<image host>). */
    public $ImgDir;

    /** Working directory at construction time. */
    public $CurrentDir;

    /** Collected error records (as returned by error_get_last()). */
    public $error = array();

    /** Output file name; derived from the URL host when left empty. */
    public $filename = null;

    /** Selector of the elements that are extracted from the page. */
    public $Allowtag = "p,ol,ul,table";

    /* ---- statistics ---- */
    public $DownImg = 0;            // number of images downloaded
    public $expendTime = 0;         // elapsed parsing time in seconds
    public $HttpRequestTime = 0;    // number of HTTP requests issued
    public $ContentLen = 0;
    public $HttpRequestArr = array(); // every URL that was requested
    public $expendmemory = 0;       // memory growth during parsing, bytes / 1000

    /**
     * Parses the page and immediately produces and sends the document.
     *
     * @param string $url Absolute URL of the HTML page to convert.
     */
    public function __construct($url)
    {
        $startTime = $this->_Time();
        $startMemory = $this->_memory();

        $this->url = $url;
        $UrlArr = parse_url($this->url);
        $this->host = $UrlArr["scheme"] . "://" . $UrlArr['host'];
        $this->CurrentDir = getcwd();
        $this->LinetextArr["table"] = array();

        // NOTE(review): simple_html_dom is handed the URL directly and is
        // presumably fetching it; the code counts this as one HTTP request.
        $html = new simple_html_dom($this->url);
        $this->HttpRequestArr[] = $this->url;
        $this->HttpRequestTime++;

        foreach ($html->find($this->Allowtag) as $value) {
            if ($value->tag == "table") {
                $this->ParseTable($value, 0, $this->LinetextArr["table"]);
            } else {
                $this->AnalysisHtmlDom($value);
            }
            $this->_recordLastError();
        }

        $endTime = $this->_Time();
        $endMemory = $this->_memory();
        $this->expendTime = round(($endTime - $startTime), 2);                 // seconds
        $this->expendmemory = round(($endMemory - $startMemory) / 1000, 2);    // bytes / 1000

        $this->CreateWordDom();
    }

    /**
     * Current Unix time in seconds, with microsecond precision.
     *
     * @return float
     */
    private function _Time()
    {
        return microtime(true);
    }

    /**
     * Current memory usage of the script in bytes.
     *
     * @return int
     */
    private function _memory()
    {
        return memory_get_usage();
    }

    /**
     * Appends the most recent PHP error to $error.
     *
     * Skips the "no error" case (null) and avoids recording the same error
     * record twice in a row, since error_get_last() keeps returning the last
     * error until a new one occurs.
     */
    private function _recordLastError()
    {
        $last = error_get_last();
        if ($last !== null && end($this->error) !== $last) {
            $this->error[] = $last;
        }
    }

    /**
     * Walks a <table> and collects its rows, descending through
     * table/thead/tbody/tfoot/tr wrappers.
     *
     * $Arr is taken by reference so the collected rows reach the caller;
     * each row is appended as a new entry so rows from different sections
     * or different tables never share an index.
     *
     * @param object $value DOM node to inspect.
     * @param int    $i     Nesting depth of $value (0 for the table itself).
     * @param array  $Arr   Row list to append to (modified in place).
     */
    private function ParseTable($value, $i, &$Arr)
    {
        $containerTags = array("table", "tbody", "thead", "tfoot", "tr");
        $first = $value->firstChild();

        if ($first && in_array($first->tag, $containerTags, true)) {
            // Still above row level: descend into every child.
            foreach ($value->children as $child) {
                $this->ParseTable($child, $i + 1, $Arr);
            }
            return;
        }

        // Row level: every child is treated as a cell.
        $row = array();
        foreach ($value->children as $cell) {
            $cellFirst = $cell->firstChild();
            // A cell that wraps a nested table is skipped here; the nested
            // table is matched by the "table" selector on its own.
            if ($cellFirst && $cellFirst->tag == "table") {
                continue;
            }
            $row[] = array("tag" => $cell->tag, "text" => trim($cell->plaintext));
        }

        if ($row) {
            $Arr[] = $row;
        }
    }

    /**
     * Recursively collects the leaf elements below $value.
     *
     * Links, images and every other leaf tag each produce one entry in
     * $LinetextArr. Images are downloaded first; an image that cannot be
     * downloaded or measured produces no entry.
     *
     * @param object $value DOM node to inspect.
     */
    private function AnalysisHtmlDom($value)
    {
        if ($value->has_child()) {
            foreach ($value->children as $child) {
                $this->AnalysisHtmlDom($child);
            }
            return;
        }

        $tmp = array();
        if ($value->tag == "a") {
            $tmp = array("tag" => $value->tag, "href" => $value->href, "text" => $value->innertext);
        } elseif ($value->tag == "img") {
            $src = $this->unescape($value->src);
            $UrlArr = parse_url($src);
            if (!isset($UrlArr['host'])) {
                // Relative URL: resolve against the page host, using the
                // already-unescaped value and exactly one separating slash.
                $src = $this->host . "/" . ltrim($src, "/");
                $UrlArr = parse_url($src);
            }
            $src = $this->getImageFromNet($src, $UrlArr); // remote image, needs downloading
            if ($src) {
                $imgsArr = $this->GD($src);
                if ($imgsArr) {
                    $tmp = array(
                        "tag" => $value->tag,
                        "src" => $src,
                        "text" => $value->alt,
                        "width" => $imgsArr['width'],
                        "height" => $imgsArr['height'],
                    );
                }
            }
        } else {
            $tmp = array("tag" => $value->tag, "text" => strip_tags($value->innertext));
        }

        if ($tmp) {
            $this->LinetextArr[] = $tmp;
        }
    }

    /**
     * Reads the pixel size of an image and halves both dimensions when
     * either one exceeds 800.
     *
     * @param string $src Path of the image file.
     * @return array|false array("width" => ..., "height" => ...), or false
     *                     when the file cannot be read as an image.
     */
    private function GD($src)
    {
        $info = getimagesize($src);
        if ($info === false) {
            $this->_recordLastError();
            return false;
        }

        $width = $info[0];
        $height = $info[1];
        if ($width > 800 || $height > 800) {
            $width = $width / 2;
            $height = $height / 2;
        }
        return array("width" => $width, "height" => $height);
    }

    /**
     * Decodes URL-encoded text, JavaScript-style %uXXXX escapes and numeric
     * HTML entities (&#xXXXX; and &#NNNN;) back into UTF-8 characters.
     *
     * @param string $str Escaped text.
     * @return string Decoded text.
     */
    public function unescape($str)
    {
        $str = rawurldecode($str);
        // Split into escape tokens and the plain text between them.
        preg_match_all("/(?:%u.{4})|&#x.{4};|&#\d+;|.+/U", $str, $r);
        $ar = $r[0];
        foreach ($ar as $k => $v) {
            if (substr($v, 0, 2) == "%u") {
                $ar[$k] = iconv("UCS-2BE", "UTF-8", pack("H4", substr($v, -4)));
            } elseif (substr($v, 0, 3) == "&#x") {
                // Hex entity: strip the "&#x" prefix and the trailing ";".
                $ar[$k] = iconv("UCS-2BE", "UTF-8", pack("H4", substr($v, 3, -1)));
            } elseif (substr($v, 0, 2) == "&#") {
                // Decimal entity: strip the "&#" prefix and the trailing ";".
                $ar[$k] = iconv("UCS-2BE", "UTF-8", pack("n", substr($v, 2, -1)));
            }
        }
        return join("", $ar);
    }

    /**
     * Downloads an image into CurrentDir/<image host>/.
     *
     * Only URLs whose path ends in a supported image extension are
     * downloaded. The local file name is the md5 of the URL path plus the
     * lower-cased extension.
     *
     * @param string      $Src    URL of the image.
     * @param array|false $UrlArr parse_url() result for $Src.
     * @return string|false Path of the saved file, or false when the URL is
     *                      not a supported image or the download/save failed.
     */
    private function getImageFromNet($Src, $UrlArr)
    {
        if (!is_array($UrlArr) || !isset($UrlArr['path'], $UrlArr['host'])) {
            return false;
        }

        $_supportedImageTypes = array('jpg', 'jpeg', 'gif', 'png', 'bmp', 'tif', 'tiff');
        // Use the real (last) extension, case-insensitively.
        $extension = strtolower(pathinfo($UrlArr['path'], PATHINFO_EXTENSION));
        $this->ImgDir = $this->CurrentDir . "/" . $UrlArr['host'];

        if (!in_array($extension, $_supportedImageTypes, true)) {
            return false;
        }

        $data = file_get_contents($Src);
        $this->HttpRequestArr[] = $Src;
        $this->HttpRequestTime++;
        if ($data === false) {
            $this->_recordLastError();
            return false;
        }

        $this->_mkdir(); // creates the directory, or records the error

        $target = $this->ImgDir . "/" . md5($UrlArr['path']) . "." . $extension;
        if (file_put_contents($target, $data) === false) {
            $this->_recordLastError();
            return false;
        }

        $this->DownImg++;
        return $target;
    }

    /**
     * Creates the image directory if it does not exist yet; a failure is
     * recorded in $error.
     */
    private function _mkdir()
    {
        if (is_dir($this->ImgDir)) {
            return;
        }
        // The mode must be an octal integer; the is_dir() re-check covers
        // the directory having been created concurrently.
        if (!mkdir($this->ImgDir, 0777, true) && !is_dir($this->ImgDir)) {
            $this->_recordLastError();
        }
    }

    /**
     * Builds the PHPWord document from the collected content and sends it.
     *
     * The statistics line from Details() comes first. List items, images
     * and paragraphs are rendered in collection order; entries with other
     * tags (for example links) are collected but not rendered. All table
     * rows are rendered into a single table.
     */
    private function CreateWordDom()
    {
        $PHPWord = new PHPWord();
        $PHPWord->setDefaultFontName('宋体');
        $PHPWord->setDefaultFontSize(11);
        $styleTable = array('borderSize' => 6, 'borderColor' => '006699', 'cellMargin' => 120);

        // New portrait section
        $section = $PHPWord->createSection();
        $section->addText($this->Details(), array(), array('spacing' => 120));

        foreach ($this->LinetextArr as $key => $lineArr) {
            // Strict comparison: on PHP < 8 the integer key 0 loosely
            // equals "table".
            if ($key === "table") {
                if ($lineArr) {
                    $PHPWord->addTableStyle('myOwnTableStyle', $styleTable);
                    $table = $section->addTable("myOwnTableStyle");
                    foreach ($lineArr as $tr) {
                        $table->addRow();
                        foreach ($tr as $td) {
                            $table->addCell(2000)->addText($td['text']);
                        }
                    }
                }
                continue;
            }

            if (!isset($lineArr['tag'])) {
                continue;
            }

            if ($lineArr['tag'] == "li") {
                $section->addListItem($lineArr['text'], 0, "", "", array('spacing' => 120));
            } elseif ($lineArr['tag'] == "img") {
                $section->addImage(
                    $lineArr['src'],
                    array('width' => $lineArr['width'], 'height' => $lineArr['height'], 'align' => 'center')
                );
            } elseif ($lineArr['tag'] == "p") {
                $section->addText($lineArr['text'], array(), array('spacing' => 120));
            }
        }

        $this->downFile($PHPWord);
    }

    /**
     * Summary line with the request count, number of downloaded images,
     * elapsed time and memory use.
     *
     * @return string
     */
    public function Details()
    {
        $msg = "一共请求:{$this->HttpRequestTime}次,共下载的图片有{$this->DownImg}张,并且下载完成大约使用时间:{$this->expendTime}秒,整个程序执行大约消耗内存是:{$this->expendmemory}KB,";
        return $msg;
    }

    /**
     * Saves the document as Word 2007 and streams it to the client as a
     * file download.
     *
     * @param PHPWord $PHPWord Document to write.
     */
    public function downFile($PHPWord)
    {
        if (empty($this->filename)) {
            $UrlArr = parse_url($this->url);
            $this->filename = $UrlArr['host'] . ".docx";
        }

        // Save file
        $objWriter = PHPWord_IOFactory::createWriter($PHPWord, 'Word2007');
        $objWriter->save($this->filename);

        if (!is_file($this->filename)) {
            $this->_recordLastError();
            return;
        }

        // Only the base name is exposed, and it is quoted so names with
        // spaces survive.
        $downloadName = str_replace('"', '', basename($this->filename));

        header("Pragma: public");
        header("Expires: 0");
        // A single Cache-Control header: a second header() call with the
        // same name replaces the first, so "public" was the effective value.
        header("Cache-Control: public");
        header("Content-Description: File Transfer");
        // The Word2007 writer produces OOXML, so use the .docx media type.
        header('Content-Type: application/vnd.openxmlformats-officedocument.wordprocessingml.document');
        // Force the download
        header('Content-Disposition: attachment; filename="' . $downloadName . '"');
        header('Content-Length: ' . filesize($this->filename));

        if (readfile($this->filename) === false) {
            $this->_recordLastError();
        }
    }
}
The key point of the above code is arguably not the Word generation itself but the use of Simple HTML DOM, an open-source HTML parser. As mentioned before, I have been reading its source code these days, which has led to two learning directions:
① Expressing
② This extended function is organized
Reaping insights from the source code:
PHP exceptions can be caught, and PHP errors can also be inspected after the fact.
error_get_last() // returns the most recent PHP error raised on the page (or null if there was none)
Summary: The above is the entire content of this article, I hope it will be helpful to everyone's study.
Related recommendations:
Example of adding php-fpm as a system service
php-fpm service startup script
PHP data type conversion (character to number, number to character)
The above is the detailed content of How to convert HTML page to word using php and save it. For more information, please follow other related articles on the PHP Chinese website!