Server IP : 15.235.198.142 / Your IP : 216.73.216.190 Web Server : Apache/2.4.58 (Ubuntu) System : Linux ballsack 6.8.0-45-generic #45-Ubuntu SMP PREEMPT_DYNAMIC Fri Aug 30 12:02:04 UTC 2024 x86_64 User : www-data ( 33) PHP Version : 8.3.6 Disable Function : NONE MySQL : OFF | cURL : ON | WGET : ON | Perl : ON | Python : OFF | Sudo : ON | Pkexec : OFF Directory : /var/www/yme/wp-content/plugins/simply-static/src/ |
Upload File : |
<?php

namespace Simply_Static;

use Exception;
use voku\helper\HtmlDomParser;
use function WPML\FP\apply;

// Exit if accessed directly
if ( ! defined( 'ABSPATH' ) ) {
	exit;
}

/**
 * Simply Static URL extractor class
 *
 * Note that in addition to extracting URLs this class also makes modifications
 * to the file behind the Simply_Static\Page that is passed into it: URLs in
 * the body of the file are rewritten according to the configured destination
 * URL type (absolute, relative or offline).
 */
class Url_Extractor {

	/**
	 * Map of HTML tag name => attributes on that tag that may contain a URL.
	 *
	 * The following pages were incredibly helpful:
	 * - http://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
	 * - http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_web_page
	 * - http://php.net/manual/en/book.dom.php
	 *
	 * @var array
	 */
	protected static $match_tags = array(
		'a'            => array( 'href', 'urn', 'style' ),
		'base'         => array( 'href' ),
		'img'          => array( 'src', 'usemap', 'longdesc', 'dynsrc', 'lowsrc', 'srcset', 'data-src', 'data-srcset', 'data-bg' ),
		'use'          => array( 'href' ),
		'picture'      => array( 'src', 'srcset', 'data-src', 'data-srcset', 'data-bg' ),
		'amp-img'      => array( 'src', 'srcset' ),
		'applet'       => array( 'code', 'codebase', 'archive', 'object' ),
		'area'         => array( 'href' ),
		'body'         => array( 'background', 'credits', 'instructions', 'logo' ),
		'input'        => array( 'src', 'usemap', 'dynsrc', 'lowsrc', 'formaction' ),
		'blockquote'   => array( 'cite' ),
		'del'          => array( 'cite' ),
		'frame'        => array( 'longdesc', 'src' ),
		'head'         => array( 'profile' ),
		'ins'          => array( 'cite' ),
		'object'       => array( 'archive', 'classid', 'codebase', 'data', 'usemap' ),
		'q'            => array( 'cite' ),
		'script'       => array( 'src' ),
		'audio'        => array( 'src', 'srcset' ),
		'figure'       => array( 'src', 'srcset' ),
		'command'      => array( 'icon' ),
		'embed'        => array( 'src', 'code', 'pluginspage' ),
		'event-source' => array( 'src' ),
		'html'         => array( 'manifest', 'background', 'xmlns' ),
		'source'       => array( 'src', 'srcset' ),
		'video'        => array( 'src', 'poster', 'srcset' ),
		'image'        => array( 'href', 'xlink:href', 'src', 'style', 'srcset' ),
		'bgsound'      => array( 'src' ),
		'div'          => array( 'href', 'src', 'style', 'data-bg', 'data-thumbnail' ),
		'span'         => array( 'href', 'src', 'style', 'data-bg' ),
		'section'      => array( 'style', 'data-bg' ),
		'footer'       => array( 'style' ),
		'header'       => array( 'style' ),
		'ilayer'       => array( 'src' ),
		'table'        => array( 'background' ),
		'td'           => array( 'background' ),
		'th'           => array( 'background' ),
		'layer'        => array( 'src' ),
		'xml'          => array( 'src' ),
		'button'       => array( 'formaction', 'style' ),
		'datalist'     => array( 'data' ),
		'select'       => array( 'data' ),
		'access'       => array( 'path' ),
		'card'         => array( 'onenterforward', 'onenterbackward', 'ontimer' ),
		'go'           => array( 'href' ),
		'option'       => array( 'onpick' ),
		'template'     => array( 'onenterforward', 'onenterbackward', 'ontimer' ),
		'wml'          => array( 'xmlns' ),
		'meta'         => array( 'content' ),
		'link'         => array( 'href' ),
		'atom'         => array( 'href' )
	);

	// /** @const */
	// protected static $match_metas = array(
	// 	'content-base',
	// 	'content-location',
	// 	'referer',
	// 	'location',
	// 	'refresh',
	// );

	/**
	 * The static page to extract URLs from
	 * @var \Simply_Static\Page
	 */
	protected $static_page;

	/**
	 * An instance of the options structure containing all options for this plugin
	 * @var \Simply_Static\Options
	 */
	protected $options = null;

	/**
	 * URLs collected by add_to_extracted_urls() while processing the page
	 * @var array
	 */
	public $extracted_urls = array();

	/**
	 * Constructor
	 *
	 * @param \Simply_Static\Page $static_page Page to extract URLs from
	 */
	public function __construct( $static_page ) {
		$this->static_page = $static_page;
		$this->options     = Options::instance();
	}

	/**
	 * Fetch the content from our file
	 *
	 * Reads the file at archive dir + the page's file_path.
	 *
	 * @return string|false File content, or false if file_get_contents() fails
	 */
	public function get_body() {
		// Setting the stream context to prevent an issue where non-latin
		// characters get converted to html codes like #1234; inappropriately
		// http://stackoverflow.com/questions/5600371/file-get-contents-converts-utf-8-to-iso-8859-1
		$opts    = array(
			'http' => array(
				'header' => "Accept-Charset: UTF-8"
			)
		);
		$context = stream_context_create( $opts );
		$path    = $this->options->get_archive_dir() . $this->static_page->file_path;

		return file_get_contents( $path, false, $context );
	}

	/**
	 * Save a string back to our file (e.g. after having updated URLs)
	 *
	 * The content is passed through the 'simply_static_content_before_save'
	 * filter before being written.
	 *
	 * @param string $content Content to write to the page's file
	 *
	 * @return int|false Result of file_put_contents()
	 */
	public function save_body( $content ) {
		$content = apply_filters( 'simply_static_content_before_save', $content, $this );

		return file_put_contents( $this->options->get_archive_dir() . $this->static_page->file_path, $content );
	}

	/**
	 * Get the Static Page.
	 *
	 * @return \Simply_Static\Page
	 */
	public function get_static_page() {
		return $this->static_page;
	}

	/**
	 * Extracts URLs from the static_page and update them based on the dest. type
	 *
	 * Returns a list of unique URLs from the body of the static_page. It only
	 * extracts URLs from the same domain, either absolute urls or relative urls
	 * that are then converted to absolute urls.
	 *
	 * Note that no validation is performed on whether the URLs would actually
	 * return a 200/OK response.
	 *
	 * @return array
	 */
	public function extract_and_update_urls() {
		$handled = false;

		if ( $this->static_page->is_type( 'html' ) ) {
			$this->save_body( $this->extract_and_replace_urls_in_html() );
			$handled = true;
		}

		if ( $this->static_page->is_type( 'css' ) ) {
			$this->save_body( $this->extract_and_replace_urls_in_css( $this->get_body() ) );
			$handled = true;
		}

		if ( $this->static_page->is_type( 'xml' ) ) {
			$this->save_body( $this->extract_and_replace_urls_in_xml() );
			$handled = true;
		}

		if ( $this->static_page->is_type( 'json' ) ) {
			$this->save_body( $this->extract_and_replace_urls_in_json() );
			$handled = true;
		}

		if ( $handled ) {
			// Replace encoded URLs.
			$this->replace_encoded_urls();

			// If activated forced string/replace for URLs (skipped whenever
			// the forms or comments integration is enabled).
			$force_replace = $this->options->get( 'force_replace_url' )
				&& ! $this->options->get( 'use_forms' )
				&& ! $this->options->get( 'use_comments' );

			if ( $force_replace ) {
				$this->force_replace_urls();
			}
		}

		return array_unique( $this->extracted_urls );
	}

	/**
	 * Replaces encoded forms of the origin URL with the destination URL
	 *
	 * Handles two encodings in the saved file:
	 * - slash-escaped URLs (http:\/\/...), as produced by wp_json_encode and
	 *   used by WP's `concatemoji`
	 * - url-encoded URLs (http%3A%2F%2F...), as found in query params
	 *
	 * Nothing is written if the file could not be read.
	 *
	 * @return void
	 */
	public function replace_encoded_urls() {
		$destination_url = $this->options->get_destination_url();
		$response_body   = $this->get_body();

		// Do not overwrite the file with an empty body when reading failed.
		if ( false === $response_body ) {
			return;
		}

		// replace wp_json_encode'd urls, as used by WP's `concatemoji`
		$response_body = str_replace( addcslashes( Util::origin_url(), '/' ), addcslashes( $destination_url, '/' ), $response_body );

		// replace encoded URLs, as found in query params
		$encoded_pattern = '/(https?%3A)?%2F%2F' . preg_quote( urlencode( Util::origin_host() ), '/' ) . '/i';
		$response_body   = $this->replace_literally( $encoded_pattern, urlencode( $destination_url ), $response_body );

		$this->save_body( $response_body );
	}

	/**
	 * Replaces origin URL with destination URL in response body
	 *
	 * This is a function of last resort for URL replacement. Ideally it was
	 * already done in one of the extract_and_replace_urls_in_x functions.
	 *
	 * This catches instances of WordPress URLs and replaces them with the
	 * destination_url. This generally works fine for absolute and relative URL
	 * generation. It'll produce sub-optimal results for offline URLs, in that
	 * it's only replacing the host and not adjusting the path according to the
	 * current page. The point of this is more to remove any traces of the
	 * WordPress URL than anything else.
	 *
	 * Nothing is written if the file could not be read.
	 *
	 * @return void
	 */
	public function force_replace_urls() {
		/*
		TODO:
		Can we get it to work with offline URLs via preg_replace_callback
		+ convert_url? To do that we'd need to grab the entire URL. Ideally that
		would also work with escaped URLs / inside of JavaScript. And even more
		ideally, we'd only have a single preg_replace.
		 */

		$destination_url = $this->options->get_destination_url();
		$response_body   = $this->get_body();

		// Do not overwrite the file with an empty body when reading failed.
		if ( false === $response_body ) {
			return;
		}

		// replace any instance of the origin url, whether it starts with https://, http://, or //.
		$response_body = $this->replace_literally( $this->origin_host_regex(), $destination_url, $response_body );

		// replace wp_json_encode'd urls, as used by WP's `concatemoji`.
		// e.g. {"concatemoji":"http:\/\/www.example.org\/wp-includes\/js\/wp-emoji-release.min.js?ver=4.6.1"}.
		$response_body = str_replace( addcslashes( Util::origin_url(), '/' ), addcslashes( $destination_url, '/' ), $response_body );
		$response_body = apply_filters( 'simply_static_force_replaced_urls_body', $response_body, $this->static_page );

		$this->save_body( $response_body );
	}

	/**
	 * Build the case-insensitive regex matching the origin host
	 *
	 * Matches the host prefixed by https://, http:// or //. The host is passed
	 * through preg_quote() so that characters such as "." only match
	 * themselves.
	 *
	 * @param bool $with_optional_trailing_slash Whether a single "/" after the
	 *                                           host is part of the match.
	 *
	 * @return string PCRE pattern including delimiters and flags
	 */
	private function origin_host_regex( $with_optional_trailing_slash = false ) {
		$suffix = $with_optional_trailing_slash ? '\/?' : '';

		return '/(https?:)?\/\/' . preg_quote( Util::origin_host(), '/' ) . $suffix . '/i';
	}

	/**
	 * Replace every match of a pattern with a literal string
	 *
	 * Unlike preg_replace(), the replacement is never interpreted as a
	 * template, so "$1" or "\1" inside it are kept verbatim. If PCRE reports
	 * an error (null result) the subject is returned unchanged instead of
	 * being lost.
	 *
	 * @param string $pattern     PCRE pattern
	 * @param string $replacement Literal replacement text
	 * @param string $subject     Text to search in
	 *
	 * @return string
	 */
	private function replace_literally( $pattern, $replacement, $subject ) {
		$replacement = (string) $replacement;

		$result = preg_replace_callback(
			$pattern,
			static function () use ( $replacement ) {
				return $replacement;
			},
			$subject
		);

		return null === $result ? $subject : $result;
	}

	/**
	 * Extract URLs and convert URLs for each tag
	 *
	 * The tag is passed by reference, so it's updated directly and nothing is
	 * returned from this function.
	 *
	 * @param simple_html_dom_node $tag SHDP dom node
	 * @param string $tag_name name of the tag
	 * @param array $attributes array of attribute names
	 *
	 * @return void
	 */
	private function extract_urls_and_update_tag( &$tag, $tag_name, $attributes ) {
		if ( isset( $tag->style ) ) {
			$tag->style = $this->extract_and_replace_urls_in_css( $tag->style );
		}

		foreach ( $attributes as $attribute_name ) {
			if ( ! isset( $tag->$attribute_name ) ) {
				continue;
			}

			$extracted_urls  = array();
			$attribute_value = $tag->$attribute_name;

			if ( 'meta' === $tag_name ) {
				// we need to verify that the meta tag is a URL.
				if ( filter_var( $attribute_value, FILTER_VALIDATE_URL ) ) {
					$extracted_urls[] = $attribute_value;
				}
			} elseif ( $attribute_name === 'srcset' || $attribute_name === 'data-srcset' ) {
				// srcset is a fair bit different from most html
				$extracted_urls = $this->extract_urls_from_srcset( $attribute_value );
			} else {
				$extracted_urls[] = $attribute_value;
			}

			$strict_url_validation = apply_filters( 'simply_static_strict_url_validation', false );
			$replacements          = array();

			foreach ( $extracted_urls as $extracted_url ) {
				if ( $strict_url_validation && ! filter_var( $extracted_url, FILTER_VALIDATE_URL ) ) {
					continue;
				}

				if ( $extracted_url === '' ) {
					continue;
				}

				$updated_extracted_url = $this->add_to_extracted_urls( $extracted_url );

				if ( ! is_null( $updated_extracted_url ) ) {
					$replacements[ $extracted_url ] = $updated_extracted_url;
				}
			}

			// Apply all replacements in a single pass: strtr() prefers the
			// longest key and never rescans text it has already replaced, so
			// duplicate candidates or candidates that are substrings of one
			// another (common in srcset) are not rewritten twice.
			if ( ! empty( $replacements ) ) {
				$attribute_value = strtr( $attribute_value, $replacements );
			}

			$tag->$attribute_name = $attribute_value;
		}
	}

	/**
	 * Loop through elements of interest in the DOM to pull out URLs
	 *
	 * There are specific html tags and -- more precisely -- attributes that
	 * we're looking for. We loop through tags with attributes we care about,
	 * extract and update any URLs we find, handle the content of <style> and
	 * <script> tags, and then return the updated HTML.
	 *
	 * @return string The HTML with all URLs converted
	 */
	private function extract_and_replace_urls_in_html() {
		$html_string = $this->get_body();
		$match_tags  = apply_filters( 'ss_match_tags', self::$match_tags );

		$dom = HtmlDomParser::str_get_html( $html_string );

		// return the original html string if dom is blank or boolean (unparseable)
		if ( $dom == '' || is_bool( $dom ) ) {
			return $html_string;
		}

		// handle tags with attributes
		foreach ( $match_tags as $tag_name => $attributes ) {
			foreach ( $dom->find( $tag_name ) as $tag ) {
				$this->extract_urls_and_update_tag( $tag, $tag_name, $attributes );
			}
		}

		// handle 'style' tag differently, since we need to parse the content.
		if ( apply_filters( 'ss_parse_inline_style', true ) ) {
			foreach ( $dom->find( 'style' ) as $tag ) {
				try {
					$tag->innerhtmlKeep = $this->extract_and_replace_urls_in_css( $tag->innerhtmlKeep );
				} catch ( Exception $e ) {
					// No valid content; skip this tag.
					continue;
				}
			}
		}

		// handle 'script' tag differently, since we need to parse the content.
		if ( apply_filters( 'ss_parse_inline_script', true ) ) {
			foreach ( $dom->find( 'script' ) as $tag ) {
				try {
					$tag->innerhtmlKeep = $this->extract_and_replace_urls_in_script( $tag->innerhtmlKeep );
					$this->extract_and_replace_urls_in_script_inner_text( $tag );
				} catch ( Exception $e ) {
					// No valid content; skip this tag.
					continue;
				}
			}
		}

		do_action( 'ss_after_extract_and_replace_urls_in_html', $dom, $this );

		// Further manipulate Dom?
		$dom = apply_filters( 'ss_dom_before_save', $dom, $this->static_page->url );

		return $dom->save();
	}

	/**
	 * Extract URLs from the srcset attribute
	 *
	 * Candidates are split on "," and the optional trailing width/density
	 * descriptor (e.g. " 2x", " 300w") is removed. The descriptor must be
	 * separated from the URL by whitespace, so a URL that merely ends in
	 * something like "2x" is left intact.
	 *
	 * NOTE(review): URLs that themselves contain a comma are still split
	 * apart by this approach.
	 *
	 * @param string $srcset Value of the srcset attribute
	 *
	 * @return array Array of extracted URLs
	 */
	private function extract_urls_from_srcset( $srcset ) {
		$extracted_urls = array();

		foreach ( explode( ',', $srcset ) as $url_and_descriptor ) {
			// remove the (optional) descriptor
			// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#attr-srcset
			$extracted_urls[] = trim( preg_replace( '/\s+[\d\.]+[xw]\s*$/', '', $url_and_descriptor ) );
		}

		return $extracted_urls;
	}

	/**
	 * Use regex to extract URLs on CSS pages
	 *
	 * URLs in CSS follow three basic patterns:
	 * - @import "common.css" screen, projection;
	 * - @import url("fineprint.css") print;
	 * - background-image: url(image.png);
	 *
	 * URLs are either contained within url(), part of an @import statement,
	 * or both.
	 *
	 * @param string $text The CSS to extract URLs from
	 *
	 * @return string The CSS with all URLs converted
	 */
	private function extract_and_replace_urls_in_css( $text ) {
		$text = html_entity_decode( $text );

		$patterns = array(
			"/url\(\s*[\"']?([^)\"']+)/", // url()
			"/@import\s+[\"']([^\"']+)/"  // @import w/o url()
		);

		foreach ( $patterns as $pattern ) {
			$replaced = preg_replace_callback( $pattern, array( $this, 'css_matches' ), $text );

			// Keep the previous text if PCRE reported an error.
			if ( null !== $replaced ) {
				$text = $replaced;
			}
		}

		return $text;
	}

	/**
	 * Replace the origin host with the destination URL inside script content
	 *
	 * The text is entity-decoded first and passed through the
	 * 'simply_static_decoded_urls_in_script' filter.
	 *
	 * @param string $text Content of a script tag
	 *
	 * @return string Script content with the origin host replaced
	 */
	private function extract_and_replace_urls_in_script( $text ) {
		$decoded_text = $this->decode_script_entities( $text );
		$decoded_text = apply_filters( 'simply_static_decoded_urls_in_script', $decoded_text, $this->static_page, $this );

		return $this->replace_literally( $this->origin_host_regex(), $this->options->get_destination_url(), $decoded_text );
	}

	/**
	 * Replace the origin host inside a script tag according to the URL type
	 *
	 * - absolute: origin host => destination URL
	 * - relative: origin host (and one optional trailing slash) => relative_path option
	 * - anything else (offline): origin host (and one optional trailing slash) => "/"
	 *
	 * @param object $tag Script tag node; its innerhtmlKeep is updated in place
	 *
	 * @return object The same tag
	 */
	private function extract_and_replace_urls_in_script_inner_text( $tag ) {
		switch ( $this->options->get( 'destination_url_type' ) ) {
			case 'absolute':
				$regex      = $this->origin_host_regex();
				$convert_to = $this->options->get_destination_url();
				break;
			case 'relative':
				// Optional trailing slash so that url.com/ & url.com both
				// convert to the relative path, ex. /path/.
				$regex      = $this->origin_host_regex( true );
				$convert_to = $this->options->get( 'relative_path' );
				break;
			default:
				// Offline mode; same optional trailing slash handling.
				$regex      = $this->origin_host_regex( true );
				$convert_to = '/';
		}

		$decoded_text = $this->decode_script_entities( $tag->innerhtmlKeep );
		$decoded_text = apply_filters( 'simply_static_decoded_text_in_script', $decoded_text, $this->static_page, $convert_to, $tag, $this );

		$tag->innerhtmlKeep = $this->replace_literally( $regex, $convert_to, $decoded_text );

		return $tag;
	}

	/**
	 * Decode HTML entities in script content
	 *
	 * For JSON content quotes are left encoded (ENT_NOQUOTES); any other
	 * content is decoded with html_entity_decode()'s default flags.
	 *
	 * @param string $text Script content
	 *
	 * @return string Decoded content
	 */
	private function decode_script_entities( $text ) {
		if ( $this->is_json( $text ) ) {
			return html_entity_decode( $text, ENT_NOQUOTES );
		}

		return html_entity_decode( $text );
	}

	/**
	 * Check whether a given string is a valid JSON representation.
	 *
	 * Copied from: WP CLI, https://github.com/wp-cli/wp-cli/blob/f3e4b0785aa3d3132ee73be30aedca8838a8fa06/php/utils.php#L1600-L1612
	 *
	 * @param string $argument String to evaluate.
	 * @param bool $ignore_scalars Optional. Whether to ignore scalar values.
	 *                               Defaults to true.
	 *
	 * @return bool Whether the provided string is a valid JSON representation.
	 */
	protected function is_json( $argument, $ignore_scalars = true ) {
		if ( ! is_string( $argument ) || '' === $argument ) {
			return false;
		}

		// Only objects and arrays count unless scalars are explicitly allowed.
		if ( $ignore_scalars && ! in_array( $argument[0], array( '{', '[' ), true ) ) {
			return false;
		}

		json_decode( $argument, true );

		return json_last_error() === JSON_ERROR_NONE;
	}

	/**
	 * callback function for preg_replace in extract_and_replace_urls_in_css
	 *
	 * Takes the match, extracts the URL, adds it to the list of URLs, converts
	 * the URL to a destination URL.
	 *
	 * The match is returned untouched when the captured URL is empty or when
	 * add_to_extracted_urls() yields null.
	 *
	 * @param array $matches Array of preg_replace matches
	 *
	 * @return string An updated string for the text that was originally matched
	 */
	public function css_matches( $matches ) {
		$full_match    = $matches[0];
		$extracted_url = isset( $matches[1] ) ? $matches[1] : '';

		if ( $extracted_url === '' ) {
			return $full_match;
		}

		$updated_extracted_url = $this->add_to_extracted_urls( $extracted_url );

		if ( is_null( $updated_extracted_url ) ) {
			return $full_match;
		}

		// Both CSS patterns end with the captured URL, so only swap that tail.
		// A blanket replace would also hit the "url(" / "@import" prefix when
		// the URL happens to be a substring of it (e.g. url(l)).
		$url_length = strlen( $extracted_url );
		if ( substr( $full_match, - $url_length ) === $extracted_url ) {
			return substr( $full_match, 0, strlen( $full_match ) - $url_length ) . $updated_extracted_url;
		}

		// Fallback for callers passing matches where the URL is not the tail.
		return str_ireplace( $extracted_url, $updated_extracted_url, $full_match );
	}

	/**
	 * Use regex to extract URLs from XML docs (e.g. /feed/)
	 * @return string The XML with all of the URLs converted
	 */
	private function extract_and_replace_urls_in_xml() {
		$xml_string = $this->get_body();
		// match anything starting with http/s plus all following characters
		// except: [space] " ' <
		$pattern = "/https?:\/\/[^\s\"'<]+/";

		return preg_replace_callback( $pattern, array( $this, 'xml_matches' ), $xml_string );
	}

	/**
	 * Use regex to extract URLs from JSON files
	 * @return string The JSON with all of the URLs converted
	 */
	private function extract_and_replace_urls_in_json() {
		$json_string = $this->get_body();
		// match anything starting with http/s plus all following characters
		// except: [space] " ' <
		$pattern = "/https?:\/\/[^\s\"'<]+/";

		return preg_replace_callback( $pattern, array( $this, 'json_matches' ), $json_string );
	}

	/**
	 * Callback function for preg_replace in extract_and_replace_urls_in_xml
	 *
	 * Takes the match, adds it to the list of URLs, converts the URL to a
	 * destination URL. The original match is returned when it is empty or
	 * when add_to_extracted_urls() yields null, so the URL is never dropped.
	 *
	 * @param array $matches Array of regex matches found in the XML doc
	 *
	 * @return string The extracted, converted URL
	 */
	private function xml_matches( $matches ) {
		$extracted_url = $matches[0];

		if ( $extracted_url === '' ) {
			return $extracted_url;
		}

		$updated_extracted_url = $this->add_to_extracted_urls( $extracted_url );

		return is_null( $updated_extracted_url ) ? $extracted_url : $updated_extracted_url;
	}

	/**
	 * Callback function for preg_replace in extract_and_replace_urls_in_json
	 *
	 * Takes the match, adds it to the list of URLs, converts the URL to a
	 * destination URL. The original match is returned when it is empty or
	 * when add_to_extracted_urls() yields null, so the URL is never dropped.
	 *
	 * @param array $matches Array of regex matches found in the JSON file
	 *
	 * @return string The extracted, converted URL
	 */
	private function json_matches( $matches ) {
		$extracted_url = $matches[0];

		if ( $extracted_url === '' ) {
			return $extracted_url;
		}

		$updated_extracted_url = $this->add_to_extracted_urls( $extracted_url );

		return is_null( $updated_extracted_url ) ? $extracted_url : $updated_extracted_url;
	}

	/**
	 * Add a URL to the extracted URLs array and convert to absolute/relative/offline
	 *
	 * URLs are first converted to absolute URLs. Then they're checked to see if
	 * they are local URLs; if they are, they're added to the extracted URLs
	 * queue (without params and fragment, filtered through
	 * 'simply_static_extracted_url') and converted via convert_url().
	 *
	 * Non-local URLs are returned as produced by
	 * Util::relative_to_absolute_url(), unconverted.
	 *
	 * @param string $extracted_url The URL as found in the document
	 *
	 * @return string|null The URL, converted to an absolute/relative/offline
	 *                     URL when local. Callers in this class treat null as
	 *                     "leave the original untouched".
	 */
	public function add_to_extracted_urls( $extracted_url ) {
		$url = Util::relative_to_absolute_url( $extracted_url, $this->static_page->url );

		if ( $url && Util::is_local_url( $url ) ) {
			// add to extracted urls queue
			$this->extracted_urls[] = apply_filters(
				'simply_static_extracted_url',
				Util::remove_params_and_fragment( $url ),
				$url,
				$this->static_page
			);

			$url = $this->convert_url( $url );
		}

		return $url;
	}

	/**
	 * Convert URL to absolute URL at desired host or to a relative or offline URL
	 *
	 * The URL is filtered through 'simply_static_pre_converted_url' before and
	 * 'simply_static_converted_url' after the conversion; the
	 * 'simply_static_page' query arg is removed from the result.
	 *
	 * @param string $url Absolute URL to convert
	 *
	 * @return string Converted URL
	 */
	private function convert_url( $url ) {
		$url = apply_filters( 'simply_static_pre_converted_url', $url, $this->static_page, $this );

		switch ( $this->options->get( 'destination_url_type' ) ) {
			case 'absolute':
				$url = $this->convert_absolute_url( $url );
				break;
			case 'relative':
				$url = $this->convert_relative_url( $url );
				break;
			case 'offline':
				$url = $this->convert_offline_url( $url );
				break;
		}

		$url = remove_query_arg( 'simply_static_page', $url );

		return apply_filters( 'simply_static_converted_url', $url, $this->static_page, $this );
	}

	/**
	 * Convert a WordPress URL to a URL at the destination scheme/host
	 *
	 * @param string $url Absolute URL to convert
	 *
	 * @return string URL at destination scheme/host
	 */
	private function convert_absolute_url( $url ) {
		$destination_url = $this->options->get_destination_url();
		$url             = Util::strip_protocol_from_url( $url );

		return str_replace( Util::origin_host(), $destination_url, $url );
	}

	/**
	 * Convert a WordPress URL to a relative path
	 *
	 * The 'relative_path' option is prepended to the URL's local path.
	 *
	 * @param string $url Absolute URL to convert
	 *
	 * @return string Relative path for the URL
	 */
	private function convert_relative_url( $url ) {
		return $this->options->get( 'relative_path' ) . Util::get_path_from_local_url( $url );
	}

	/**
	 * Convert a WordPress URL to a path for offline usage
	 *
	 * This function compares current page's URL to the provided URL and
	 * creates a path for getting from one page to the other. It also attaches
	 * /index.html onto the end of any path that isn't a file, before any
	 * fragments or params.
	 *
	 * Example:
	 *   static_page->url: http://static-site.dev/2013/01/11/page-a/
	 *   $url:             http://static-site.dev/2013/01/10/page-b/
	 *   path:             ./../../10/page-b/index.html
	 *
	 * @param string $url Absolute URL to convert
	 *
	 * @return string Converted path
	 */
	private function convert_offline_url( $url ) {
		// remove the scheme/host from the url
		$page_path      = Util::get_path_from_local_url( $this->static_page->url );
		$extracted_path = Util::get_path_from_local_url( $url );

		// create a path from one page to the other
		$path = Util::create_offline_path( $extracted_path, $page_path );

		$path_info = Util::url_path_info( $url );
		if ( $path_info['extension'] === '' ) {
			// If there's no extension, we need to add a /index.html,
			// and do so before any params or fragments.
			$clean_path = Util::remove_params_and_fragment( $path );
			$fragment   = substr( $path, strlen( $clean_path ) );

			$path = trailingslashit( $clean_path ) . 'index.html' . $fragment;
		}

		return $path;
	}
}