ljsdev-ssg

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 771aa648cf96d6718819ef81c9ee923e10897eec
parent 8cc3df34b5f14435671ad9922a5af4d4fe46bacc
Author: Leon <leon@wp2static.com>
Date:   Mon, 19 Aug 2019 07:30:43 +0200

extract content from WP export

Diffstat:
M .gitignore | 2 ++
A extract-title-body-date.php | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 80 insertions(+), 0 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -1 +1,3 @@
 dst/
+*swp*
+
diff --git a/extract-title-body-date.php b/extract-title-body-date.php
@@ -0,0 +1,78 @@
+<?php
+# script: parse WordPress + Elementor generated HTML & print cleaned title, subtitle and body content only
+# DOMinnerHTML(): return the inner HTML of $element, i.e. the serialized HTML of all of its child nodes
+function DOMinnerHTML(DOMNode $element) {
+    $innerHTML = "";
+    $children = $element->childNodes;
+
+    foreach ($children as $child)
+    {
+        $innerHTML .= $element->ownerDocument->saveHTML($child);
+    }
+
+    return $innerHTML;
+}
+
+# load input (path given as first CLI argument) into DOMDocument
+$xml_doc = new DOMDocument();
+
+$file = file_get_contents( $argv[1] );
+// prevent warnings, via https://stackoverflow.com/a/9149241/1668057
+libxml_use_internal_errors( true );
+$xml_doc->loadHTML( $file );
+libxml_use_internal_errors( false );
+
+$title = "";
+$stitle = "";
+$body = "";
+
+# get title (blogs): first <h1> in the document
+$h1 = $xml_doc->getElementsByTagName( 'h1' );
+if (isset($h1[0])) {
+    $title = "<h1>{$h1[0]->nodeValue}</h1>";
+    echo $title . PHP_EOL;
+
+    # get body (blogs): inner HTML of the first XPath match
+    $finder = new DomXPath($xml_doc);
+    $bodyel = $finder->query("/html/body/div/div/div/section[3]/div/div/div/div/div/div/div");
+
+    if (isset($bodyel[0])) {
+        $body = DOMinnerHTML($bodyel[0]);
+        echo $body . PHP_EOL;
+    }
+} else {
+    # get title (essays): second <h2> in the document, output as <h1>
+    $h2 = $xml_doc->getElementsByTagName( 'h2' );
+    if (isset($h2[1])) {
+        $title = "<h1>{$h2[1]->nodeValue}</h1>";
+        echo $title . PHP_EOL;
+    }
+
+    # get subtitle (essays): first <p> matched by the XPath, output as <h2>
+    $finder = new DomXPath($xml_doc);
+    $subtitle = $finder->query("/html/body/div/div/div/section[2]/div/div/div/div/div/div[2]/div/div/p");
+
+    if (isset($subtitle[0])) {
+        $stitle = "<h2>{$subtitle[0]->nodeValue}</h2>";
+        echo $stitle . PHP_EOL;
+    }
+
+    # get body (essays): inner HTML of the fourth XPath match
+    $finder = new DomXPath($xml_doc);
+    $bodyel = $finder->query("/html/body/div/div/div/section[3]/div/div/div/div/div/div/div");
+
+    if (isset($bodyel[3])) {
+        $body = DOMinnerHTML($bodyel[3]);
+        echo $body . PHP_EOL;
+    }
+}
+
+
+# get post-body
+# get date
+
+# form new body content
+# strip all classes and extra attributes
+
+
+# output cleaned content