<?php
/*******************************************************************
* Glype is copyright and trademark 2007-2013 UpsideOut, Inc. d/b/a Glype
* and/or its licensors, successors and assigners. All rights reserved.
*
* Use of Glype is subject to the terms of the Software License Agreement.
* http://www.glype.com/license.php
*******************************************************************
* This is the parser for the proxy - changes the original 'raw'
* document so that everything (images, links, etc.) is rerouted to
* be downloaded via the proxy script instead of directly.
******************************************************************/
class parser {

    # State of javascript parser - null for parse everything, false
    # for parse all non-standard overrides, or (array) with specifics
    private $jsFlagState;

    # Browsing options (Remove Scripts, etc.) - an array read with the keys
    # 'stripTitle', 'encodePage', 'stripJS' and 'stripObjects'
    private $htmlOptions;

    # Constructor accepts options and saves them in the object
    function __construct($htmlOptions, $jsFlags) {
        $this->jsFlagState = $jsFlags;
        $this->htmlOptions = $htmlOptions;
    }


    /*****************************************************************
    * HTML parsers - main parsing function splits up document into
    * component parts ('normal' HTML, scripts and styles)
    ******************************************************************/

    # Parse a complete HTML document.
    #   $input  - the raw document
    #   $insert - HTML to place straight after <body> (false to skip)
    #   $inject - if truthy, the result of injectionJS() is placed after <head>
    #   $footer - HTML to place straight before </body>
    # Returns the rewritten document.
    function HTMLDocument($input, $insert='', $inject=false, $footer='') {

        # Raise the PCRE backtrack limit for big documents on PHP <= 5.3.7
        if (strlen($input)>65536) {
            if (version_compare(PHP_VERSION, '5.3.7')<=0) {
                ini_set('pcre.backtrack_limit', 1000000);
            }
        }

        #
        # Apply parsing that only needs to be done once..
        #

        # Record the charset (only if nobody has set the global already)
        global $charset;
        if (!isset($charset)) {
            $meta_equiv = preg_match('#(<meta[^>]*http\-equiv=["\']?Content\-Type[^>]*>)#is', $input, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[0][0] : null;
            if (isset($meta_equiv)) {
                # Read the charset from the meta tag itself; searching the whole
                # document could pick up an unrelated "charset=" that occurs earlier
                $charset = preg_match('#charset\s*=\s*([^"\'\s>]*)#is', $meta_equiv, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[1][0] : null;
            }
        }

        # Remove empty script comments
        $input = preg_replace('#/\*\s*\*/#s', '', $input);

        # Remove conditional comments (keep the content of "if !IE" blocks)
        $input = preg_replace('#<\!\-\-\[if \!IE\]>\s*\-\->(.*?)<\!\[endif\]\-\->#s','$1',$input);
        $input = preg_replace('#<\!\-\-\[if.*?<\!\[endif\]\-\->#s','',$input);

        # Prevent websites from calling disableOverride()
        $input = preg_replace('#disableOverride#s', 'disabled___disableOverride', $input);

        # Remove titles if option is enabled
        if ( $this->htmlOptions['stripTitle'] || $this->htmlOptions['encodePage'] ) {
            $input = preg_replace('#<title.*?</title>#is', '', $input, 1);
            $input = preg_replace('#<meta[^>]*name=["\'](title|description|keywords)["\'][^>]*>#is', '', $input, 3);
            $input = preg_replace('#<link[^>]*rel=["\'](icon|shortcut icon)["\'][^>]*>#is', '', $input, 2);
        }

        # Remove and record a <base> href
        $input = preg_replace_callback('#<base href\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,2048}|[^\\\']{1,2048})|[^\s"\\\'>]{1,2048}))(?(1)\\1|)[^>]*>#i', 'html_stripBase', $input, 1);

        # Proxy url= values in meta redirects
        $input = preg_replace_callback('#content\s*=\s*(["\\\'])?[0-9]+\s*;\s*url=([\\\'"]|&\#39;)?((?(?<=")[^"]+|(?(?<=\\\')[^\\\']+|[^\\\'" >]+)))(?(2)\\2|)(?(1)\\1|)#i', 'html_metaRefresh', $input, 1);

        # Process forms
        $input = preg_replace_callback('#<form([^>]*)>(.*?)</form>#is', 'html_form', $input);

        # Remove scripts blocks (avoids individual processing below)
        if ( $this->htmlOptions['stripJS'] ) {
            $input = preg_replace('#<script[^>]*>.*?</script>#is', '', $input);
        }

        #
        # Split up the document into its different types and parse them
        #

        # Build up new document into this var
        $new = '';
        $offset = 0;

        # Find instances of script or style blocks
        while ( preg_match('#<(s(?:cript|tyle))[^>]*>#i', $input, $match, PREG_OFFSET_CAPTURE, $offset) ) {

            # What type of block is this?
            $block = strtolower($match[1][0]);

            # Start position of content
            $outerStart = $match[0][1];
            $innerStart = $outerStart + strlen($match[0][0]);

            # Determine type of end tag and find its position; if the proper
            # end tag is missing, settle for the next "</" of any kind
            $endTag = "</$block>";
            $innerEnd = stripos($input, $endTag, $innerStart);
            if ($innerEnd===false) {
                $endTag = "</";
                $innerEnd = stripos($input, $endTag, $innerStart);
                if ($innerEnd===false) {
                    # Nothing closes the block at all: drop everything from the
                    # first <script> tag onwards and stop splitting
                    $input = preg_replace('#<script[^>]*>.*?$#is', '', $input);
                    break;
                }
            }

            # Parse everything up till here (including the opening tag, which
            # goes through the HTML parser) and add to the new document
            $new .= $this->HTML(substr($input, $offset, $innerStart - $offset));

            # Find parsing function
            $parseFunction = $block == 'style' ? 'CSS' : 'JS' ;

            # Add the parsed block
            $new .= $this->$parseFunction(substr($input, $innerStart, $innerEnd - $innerStart));

            # Move offset to the end tag, which is left for the next HTML chunk
            $offset = $innerEnd;
        }

        # And add the final chunk (between last script/style block and end of doc)
        $new .= $this->HTML(substr($input, $offset));

        # Replace input with the updated document
        $input = $new;

        # Give a loaded plugin the chance to post-process the document
        global $foundPlugin;
        if ( $foundPlugin && function_exists('postParse') ) {
            $input = postParse($input, 'html');
            $foundPlugin=false;
        }

        # Make URLs relative
        $input = preg_replace('#=\s*(["\'])?\s*https?://[^"\'>/]*/#i', '=$1/', $input);

        # Encode the page
        if ( $this->htmlOptions['encodePage'] ) {
            $input = encodePage($input);
        }

        #
        # Now add our own code bits
        #
        # The inserted strings are used inside preg_replace() replacement
        # strings, where "$n" and "\n" are backreferences. They are escaped
        # with addcslashes() so that they are always inserted literally.
        #

        # Insert our mini form after the <body>
        if ( $insert !== false ) {

            # Check for a frameset (a boolean, so a frameset at offset 0 counts)
            $useFrames = stripos($input, '<frameset') !== false;
            if ( $useFrames ) {
                # Flag the frames so only first displays mini-form
                $input = preg_replace_callback('#<frame[^>]+src\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,2048}|[^\\\']{1,2048})|[^\s"\\\'>]{1,2048}))(?(1)\\1|)#i', 'html_flagFrames', $input);
            }

            # Attempt to add after body
            $input = preg_replace('#(<body[^>]*>)#i', '$1' . addcslashes((string) $insert, '\\$'), $input, 1, $tmp);

            # Check it inserted and prepend (if not a frameset)
            if ( ! $tmp && ! $useFrames ) {
                $input = $insert . $input;
            }
        }

        # Insert our javascript library
        if ( $inject ) {

            # Generate javascript to insert
            $inject = injectionJS();

            # Add our proxy javascript after <head>
            $input = preg_replace('#(<head[^>]*>)#i', '$1' . addcslashes((string) $inject, '\\$'), $input, 1, $tmp);

            # If no <head>, just prepend
            if ( ! $tmp ) {
                $input = $inject . $input;
            }
        }

        # Add anything to the footer?
        if ( $footer ) {
            $input = preg_replace('#(</body[^>]*>)#i', addcslashes((string) $footer, '\\$') . '$1', $input, 1, $tmp);

            # If no </body>, just append the footer
            if ( ! $tmp ){
                $input .= $footer;
            }
        }

        # Return new document
        return $input;
    }

    # Parse HTML sections (everything outside script/style block contents)
    function HTML($input) {

        # Removing objects? Follow spec and display inner content of object tags instead.
        if ( $this->htmlOptions['stripObjects'] ) {

            # Remove all object tags (including those deprecated but still common)
            $input = preg_replace('#<(?>object|applet|param|embed)[^>]*>#i', '', $input, -1, $tmp);

            # Found any? Remove the corresponding end tags (at most as many
            # as the number of opening tags removed)
            if ( $tmp ) {
                $input = preg_replace('#</(?>object|applet|param|embed)>#i', '', $input, $tmp);
            }

        } else {

            # Parse <param name="movie" value="URL"> tags
            $input = preg_replace_callback('#<param[^>]+value\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,2048}|[^\\\']{1,2048})|[^\s"\\\'>]{1,2048}))(?(1)\\1|)[^>]*>#i', 'html_paramValue', $input);

            # To do: proxy object related URLs
        }

        # Show content within <noscript> tags
        # (preg_ seems to be faster than 2 str_ireplace() calls)
        if ( $this->htmlOptions['stripJS'] ) {
            $input = preg_replace('#</?noscript>#i', '', $input);
        }

        # Parse onX events
        $input = preg_replace_callback('#\b(on(?<!\.on)[a-z]{2,20})\s*=\s*([\\\'"])?((?(2)(?(?<=")[^"]{1,2048}|[^\\\']{1,2048})|[^\s"\\\'>]{1,2048}))(?(2)\\2|)#i', array(&$this, 'html_eventJS'), $input);

        # Parse style attributes
        $input = preg_replace_callback('#style\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,2048}|[^\\\']{1,2048})|[^\s"\\\'>]{1,2048}))(?(1)\\1|)#i', array(&$this, 'html_elementCSS'), $input);

        # Proxy URL attributes - this is the bottleneck but optimized as much as possible
        $input = preg_replace_callback('#(?><[A-Z0-9]{1,15})(?>\s+[^>\s]+)*?\s*(?>(href|src|background|poster)\s*=(?!\\\\)\s*)(?>([\\\'"])?)((?(2)(?(?<=")[^"]{1,2048}|[^\\\']{1,2048})|[^ >]{1,2048}))(?(2)\\2|)#i', 'html_attribute', $input);

        # Return changed input
        return $input;
    }

    # Proxy an onX javascript event.
    # $input: [1] attribute name, [2] quote (may be empty), [3] handler code.
    # The whole attribute is dropped when scripts are being stripped.
    function html_eventJS($input) {
        return $this->htmlOptions['stripJS'] ? '' : $input[1] . '=' . $input[2] . $this->JS($input[3]) . $input[2];
    }

    # Proxy a style="CSS" attribute.
    # $input: [1] quote (may be empty), [2] the CSS text.
    function html_elementCSS($input) {
        return 'style=' . $input[1] . $this->CSS($input[2]) . $input[1];
    }


    /*****************************************************************
    * CSS parser - main parsing function
    * CSS parsing is complicated by the caching of CSS files. We need
    * to consider (A) cross-domain caching and (B) the unique URLs option.
    *   A) If possible, use a relative URL so the saved URLs do not explicitly
    *      point to a single domain.
    *   B) There is a second set of callback functions with "_unique" suffixed
    *      and these return the original URL to be reparsed.
    ******************************************************************/

    # The URLs depend on the unique and path info settings. The $storeUnique
    # parameter allows us to select the "_unique" callbacks.
    function CSS($input, $storeUnique=false) {

        # What type of parsing is this? Normally we parse any URLs to redirect
        # back through the proxy but not when storing a cache with unique URLs.
        $type = $storeUnique ? '_unique' : '';

        # CSS needs proxying the calls to url(), @import and src=''
        $input = preg_replace_callback('#\burl\s*\(\s*[\\\'"]?([^\\\'"\)]+)[\\\'"]?\s*\)#i', 'css_URL' . $type, $input);
        $input = preg_replace_callback('#@import\s*[\\\'"]([^\\\'"\(\)]+)[\\\'"]#i', 'css_import' . $type, $input);
        $input = preg_replace_callback('#\bsrc\s*=\s*([\\\'"])?([^)\\\'"]+)(?(1)\\1|)#i', 'css_src' . $type, $input);

        # Make URLs relative
        $input = preg_replace('#https?://[^"\'>/]*/#i', '/', $input);

        # Return changed
        return $input;
    }


    /*****************************************************************
    * Javascript parser - main parsing function
    *
    * The specific parts that need proxying depends on which javascript
    * functions we've been able to override. On first page load, the browser
    * capabilities are tested to see what we can do client-side and the results
    * sent back to us. This allows us to parse only what we have to.
    * If $CONFIG['override_javascript'] is disabled, all commands are parsed
    * server-side. This will use much more CPU!
    *
    * Commands to proxy only if no override at all:
    *   document.write()
    *   document.writeln()
    *   window.open()
    *   eval()
    *
    * Commands to proxy, regardless of browser capabilities:
    *   location.replace()
    *   .innerHTML=
    *
    * Commands to proxy if the extra "watch" flag is set
    * (the browser doesn't support the .watch() method):
    *   location=
    *   x.location=
    *   location.href=
    *
    * Commands to proxy if the extra "setters" flag is set
    * (the browser doesn't support the __defineSetter__() method):
    *   .src=
    *   .href=
    *   .background=
    *   .action=
    *
    * Commands to proxy if the extra "ajax" flag is set
    * (the browser failed to override the .open() method):
    *   XMLHttpRequest.open()
    ******************************************************************/

    function JS($input) {

        # Stripping?
        if ( $this->htmlOptions['stripJS'] ) {
            return '';
        }

        # Get our flags
        $flags = $this->jsFlagState;

        # Unless we know we don't need to, apply all the browser-specific flags
        if ( ! is_array($this->jsFlagState) ) {
            $flags = array('ajax', 'watch', 'setters');
        }

        # If override is disabled, add a "base" flag
        if ( $this->jsFlagState === null ) {
            $flags[] = 'base';
        }

        # Start parsing!
        $search = array();

        # Create shortcuts to various search patterns:
        #   "before" - matches preceding character (string of single char) [ignoring whitespace]
        #   "after"  - matches next character (string of single char) [ignoring whitespace]
        #   "id"     - key for identifying the original match (e.g. if we have >1 of the same key)
        $assignmentPattern = array('before' => '.', 'after' => '=');
        $methodPattern = array('before' => '.', 'after' => '(');
        $functionPattern = array('after' => '(');

        # Configure strings to search for, starting with always replaced commands
        $search['innerHTML'][] = $assignmentPattern;
        $search['location'][] = array('after' => '.', 'id' => 'replace()');
        # ^ This is only for location.replace() - other forms are handled later

        # Look for attribute assignments
        if ( in_array('setters', $flags) ) {
            $search['src'][] = $assignmentPattern;
            $search['href'][] = $assignmentPattern;
            $search['action'][] = $assignmentPattern;
            $search['background'][] = $assignmentPattern;
            $search['poster'][] = $assignmentPattern;
        }

        # Look for location changes
        # location.href will be handled above, location= is handled here
        if ( in_array('watch', $flags) ) {
            $search['location'][] = array('after' => '=', 'id' => 'assignment');
        }

        # Look for .open() if either AJAX (XMLHttpRequest.open) or
        # base (window.open) flags are present
        if ( in_array('ajax', $flags) || in_array('base', $flags) ) {
            $search['open'][] = $methodPattern;
        }

        # Add the basic code if no override
        if ( in_array('base', $flags) ) {
            $search['eval'][] = $functionPattern;
            $search['writeln'][] = $methodPattern;
            $search['write'][] = $methodPattern;
        }

        # Set up starting parameters
        $offset = 0;
        $length = strlen($input);
        $searchStrings = array_keys($search);

        while ( $offset < $length ) {

            # Start off by assuming no more items (i.e. the next position
            # of interest is the end of the document)
            $commandPos = $length;

            # Loop through the search subjects
            foreach ( $searchStrings as $item ) {

                # Find the next occurrence of this item that starts a word and
                # is closer than the best candidate so far. An occurrence that
                # is merely the tail of a longer identifier (e.g. "imgsrc") is
                # stepped over rather than hiding later, genuine occurrences.
                $searchFrom = $offset;
                while ( ( $tmp = strpos($input, $item, $searchFrom) ) !== false && $tmp < $commandPos ) {

                    # Start of document, or previous char is not "_" / a letter?
                    if ( $tmp === 0 || ( $input[$tmp-1] != '_' && ! ctype_alpha($input[$tmp-1]) ) ) {
                        $commandPos = $tmp;
                        $command = $item;
                        break;
                    }

                    # Not a whole word, look further along
                    $searchFrom = $tmp + 1;
                }
            }

            # No matches found? Finish parsing.
            if ( $commandPos == $length ) {
                break;
            }

            # We've found the main point of interest; now use the
            # search parameters to check the surrounding chars to validate
            # the match.
            $valid = false;

            foreach ( $search[$command] as $pattern ) {

                # Check the preceding chars
                if ( isset($pattern['before']) && str_checkprev($input, $pattern['before'], $commandPos-1) === false ) {
                    continue;
                }

                # Check next chars
                if ( isset($pattern['after']) && ( $charPos = str_checknext($input, $pattern['after'], $commandPos + strlen($command), false, false) ) === false ) {
                    continue;
                }

                # Position of the first non-whitespace char after the "after" char
                $postCharPos = ($charPos + 1) + strspn($input, " \t\r\n", $charPos + 1);

                # Still here? Match must be OK so generate a match ID
                if ( isset($pattern['id']) ) {
                    $valid = $command . $pattern['id'];
                } else {
                    $valid = $command;
                }

                break;
            }

            # What we do next depends on which match (if any) we've found...
            switch ( $valid ) {

                # Assignment
                case 'src':
                case 'href':
                case 'background':
                case 'poster':
                case 'action':
                case 'locationassignment':
                case 'innerHTML':

                    # Check our post-char position for = as well (could be equality
                    # test rather than assignment, i.e. == )
                    if ( ! isset($input[$postCharPos]) || $input[$postCharPos] == '=' ) {
                        break;
                    }

                    # Find the end of this statement
                    $endPos = analyzeAssign_js($input, $charPos);
                    $valueLength = $endPos - $postCharPos;

                    # Produce replacement command
                    $replacement = sprintf('parse%s(%s)', $command=='innerHTML' ? 'HTML' : 'URL', substr($input, $postCharPos, $valueLength));

                    # Make the replacement
                    $input = substr_replace($input, $replacement, $postCharPos, $valueLength);

                    # Keep the document length in step with the document
                    $length = strlen($input);

                    # Move offset up to new position
                    $offset = $endPos + 10;

                    # Go get next match
                    continue 2;


                # Function calls - we don't know for certain if these are in fact members of the
                # appropriate objects (window/XMLHttpRequest for .open(), document for .write() and
                # .writeln) so we won't change anything. Main.js still overrides these functions but
                # does nothing with them by default. We add an extra parameter to tell our override
                # to kick in.
                case 'open':
                case 'write':
                case 'writeln':

                    # Find the end position. The scan starts on the opening "(" so
                    # the call's own brackets are balanced and the result is the
                    # end of the statement that contains the call.
                    $endPos = analyze_js($input, $charPos);

                    # Insert our additional argument just before the char at
                    # $endPos - 1; omit the comma when the span holds nothing
                    # but brackets/terminators (i.e. the call has no arguments)
                    $glStr=',"gl"';
                    if (strspn($input, ";\n\r\+{}()[]", $charPos) >= ($endPos - $charPos)) {
                        $glStr='"gl"';
                    }
                    $input = substr_replace($input, $glStr, $endPos - 1, 0);

                    # Adjust the document length
                    $length = strlen($input);

                    # And move the offset
                    $offset = $endPos + strlen($glStr);

                    # Get next match
                    continue 2;


                # Eval() is just as easy since we can just wrap the entire thing in parseJS().
                case 'eval':

                    # Ensure this is a call to eval(), not anotherfunctionendingineval()
                    if ( $commandPos > 0 && strpos('abcdefghijklmnopqrstuvwxyz123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_', $input[$commandPos-1]) !== false ) {
                        break;
                    }

                    # Find the end position. As above, the scan starts on the "("
                    # so the wrapped value runs up to the end of the statement
                    # and includes the call's own closing ")".
                    $endPos = analyze_js($input, $charPos);
                    $valueLength = $endPos - $postCharPos;

                    # Generate our replacement
                    $replacement = sprintf('parseJS(%s)', substr($input, $postCharPos, $valueLength));

                    # Make the replacement
                    $input = substr_replace($input, $replacement, $postCharPos, $valueLength);

                    # Adjust the document length
                    $length = strlen($input);

                    # And move the offset
                    $offset = $endPos + 9;
                    continue 2;


                # location.replace() is a tricky one. We have the position of the char
                # after . as $postCharPos and need to ensure we're calling replace(),
                # then parse the entire URL
                case 'locationreplace()':

                    # Validate the match
                    if ( ! preg_match('#\Greplace\s*\(#', $input, $tmp, 0, $postCharPos) ) {
                        break;
                    }

                    # Scan from just before "replace(" so that its opening
                    # bracket is counted as open, exactly as for eval() above
                    $charPos = $postCharPos - 1;

                    # Move $postCharPos to inside the brackets of .replace() so
                    # only the argument is wrapped, not the call itself
                    $postCharPos += strlen($tmp[0]);

                    # Find the end position
                    $endPos = analyze_js($input, $charPos);
                    $valueLength = $endPos - $postCharPos;

                    # Generate our replacement
                    $replacement = sprintf('parseURL(%s)', substr($input, $postCharPos, $valueLength));

                    # Make the replacement
                    $input = substr_replace($input, $replacement, $postCharPos, $valueLength);

                    # Adjust the document length
                    $length = strlen($input);

                    # And move the offset
                    $offset = $endPos + 9;
                    continue 2;
            }

            # Still here? A match didn't validate so adjust offset to just after
            # current position
            $offset = $commandPos + 1;
        }

        # Ignore document.domain
        $input = str_replace('document.domain', 'ignore', $input);

        # Return changed
        return $input;
    }
}
/*****************************************************************
* HTML callbacks
******************************************************************/
# Callback for the <base href> pattern: stores the href value (capture
# group 2) in the global $base and returns an empty string so that the
# whole tag is removed from the document.
function html_stripBase($input) {
    $GLOBALS['base'] = $input[2];

    return '';
}
# Proxy the location of a meta refresh.
# $input is the match of the meta refresh pattern: [1] the quote around the
# content value (may be empty), [2] the quote around the URL (may be empty),
# [3] the URL. The match always ends with URL + [2] + [1], so the URL is
# swapped at that exact position; a plain str_replace() would also rewrite
# any earlier text that happens to equal the URL.
function html_metaRefresh($input) {

    # Closing quotes that follow the URL in the matched text
    $closing = $input[2] . $input[1];

    # Everything in front of the URL is kept untouched
    $prefixLength = strlen($input[0]) - strlen($input[3]) - strlen($closing);

    return substr($input[0], 0, $prefixLength) . proxyURL($input[3]) . $closing;
}
# Proxy URL in <param name="movie" value="URL">.
# $input is the match of the <param ... value=...> pattern: [0] the whole
# tag, [1] the quote around the value (may be empty), [2] the value.
# Tags that do not mention "movie" are returned unchanged.
function html_paramValue($input) {

    # Check for a name="movie" tag
    if ( stripos($input[0], 'movie') === false ) {
        return $input[0];
    }

    # Locate the value inside the tag so that only the value is replaced.
    # A plain str_replace() would also rewrite other parts of the tag that
    # happen to contain the same text (e.g. value="m" hits "param" too).
    $pattern = '#<param[^>]+value\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,2048}|[^\\\']{1,2048})|[^\s"\\\'>]{1,2048}))(?(1)\\1|)[^>]*>#i';

    if ( preg_match($pattern, $input[0], $found, PREG_OFFSET_CAPTURE) && $found[2][0] === $input[2] ) {
        return substr_replace($input[0], proxyURL($input[2]), $found[2][1], strlen($input[2]));
    }

    # Could not pin down the position - fall back to replacing by text
    return str_replace($input[2], proxyURL($input[2]), $input[0]);
}
# Process forms - the query string is used by the proxy script
# and GET data needs to be encoded anyway. We convert all GET
# forms to POST and then the proxy script will forward it properly.
# $input: [1] the attributes of the <form> tag, [2] the form contents.
# Returns the rebuilt <form>...</form> markup.
function html_form($input) {

    $attributes = $input[1];
    $contents = $input[2];
    $wasConverted = false;

    if ( ! preg_match('#\bmethod\s*=\s*["\\\']?(get|post)["\\\']?#i', $attributes, $method) ) {

        # No method given and GET is the default, so append a POST method
        $attributes .= ' method="post"';
        $wasConverted = true;

    } elseif ( strtolower($method[1]) != 'post' ) {

        # Explicit GET: convert to post and flag as a conversion
        $attributes = str_replace($method[0], 'method="post"', $attributes);
        $wasConverted = true;
    }

    # Converted forms carry a hidden marker field
    $marker = $wasConverted ? '<input type="hidden" name="convertGET" value="1">' : '';

    # To do: javascript onsubmit event to immediately redirect to the appropriate
    # location using GET data, without an intermediate POST to the proxy script.

    # Proxy the form action
    $attributes = preg_replace_callback('#\baction\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,2048}|[^\\\']{1,2048})|[^\s"\\\'>]{1,2048}))(?(1)\\1|)#i', 'html_formAction', $attributes);

    # What type of form is this? Due to register_globals support, PHP converts
    # a number of characters to _ in incoming variable names. To get around this,
    # we can use the raw post data from php://input but this is not available
    # for multipart forms. Instead we must encode the input names in these forms.
    if ( stripos($attributes, 'multipart/form-data') ) {
        $contents = preg_replace_callback('#name\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,2048}|[^\\\']{1,2048})|[^\s"\\\'>]{1,2048}))(?(1)\\1|)#i', 'html_inputName', $contents);
    }

    # Return updated form
    return '<form' . $attributes . '>' . $marker . $contents . '</form>';
}
# Proxy the action="URL" value in forms.
# $input: [1] the quote around the value (may be empty), [2] the URL.
function html_formAction($input) {
    $quote = $input[1];
    $proxied = proxyURL($input[2]);

    return 'action=' . $quote . $proxied . $quote;
}
# Encode input names.
# $input: [1] the quote around the name (may be empty), [2] the name.
function html_inputName($input) {
    $quote = $input[1];
    $encoded = inputEncode($input[2]);

    return 'name=' . $quote . $encoded . $quote;
}
# Proxy URL values in attributes.
# $input is the match of the URL attribute pattern: [0] the tag text up to
# and including the attribute, [1] the attribute name, [2] the quote around
# the value (may be empty), [3] the URL.
function html_attribute($input) {

    # Is this an iframe? (tag name directly after the "<")
    $flag = stripos($input[0], 'iframe') === 1 ? 'frame' : '';

    # URL occurred as value of an attribute and should have been htmlspecialchar()ed
    # We need to do the job of the browser and decode before proxying.
    $proxied = htmlspecialchars(proxyURL(htmlspecialchars_decode($input[3]), $flag));

    # The match always ends with the URL followed by its closing quote, so
    # swap the URL at that exact position. A plain str_replace() would also
    # rewrite earlier parts of the tag that contain the same text
    # (e.g. <a href="a"> would have its tag name replaced too).
    $quote = $input[2];
    $prefixLength = strlen($input[0]) - strlen($input[3]) - strlen($quote);

    return substr($input[0], 0, $prefixLength) . $proxied . $quote;
}
# Flag frames in a frameset so only the first one shows the mini-form.
# This could be done in the above callback but adds extra processing
# when 99% of the time, it won't be needed.
# $input is the match of the <frame ... src=...> pattern: [1] the quote
# around the URL (may be empty), [2] the URL. The first call (tracked in a
# static variable) returns its match unchanged; every later call appends a
# frame flag to the URL.
function html_flagFrames($input) {

    static $addFlag;

    # If it's the first frame, leave it but set the flag var
    if ( ! isset($addFlag) ) {
        $addFlag = true;
        return $input[0];
    }

    # Query-string style flag if the URL has a "?" anywhere (including at
    # the very start), otherwise the path style flag
    $suffix = strpos($input[2], '?') !== false ? '&f=frame' : 'fframe/';

    # The match always ends with the URL followed by its closing quote, so
    # rebuild that tail instead of str_replace()ing, which would also touch
    # any earlier part of the tag that contains the same text as the URL
    $quote = isset($input[1]) ? $input[1] : '';
    $prefixLength = strlen($input[0]) - strlen($input[2]) - strlen($quote);

    return substr($input[0], 0, $prefixLength) . $input[2] . $suffix . $quote;
}
/*****************************************************************
* CSS callbacks
******************************************************************/
# Proxy CSS url(LOCATION).
# $input[1] is the location; surrounding whitespace is trimmed before
# proxying and the result is emitted without quotes.
function css_URL($input) {
    $location = trim($input[1]);

    return 'url(' . proxyURL($location) . ')';
}
# Proxy CSS @import "URL".
# $input[1] is the URL; the result always uses double quotes.
function css_import($input) {
    $proxied = proxyURL($input[1]);

    return '@import "' . $proxied . '"';
}
# Proxy CSS src=.
# $input: [1] the quote around the value (may be empty), [2] the URL.
function css_src($input) {
    $quote = $input[1];
    $proxied = proxyURL($input[2]);

    return 'src=' . $quote . $proxied . $quote;
}
# Callbacks for use with unique URLs and cached CSS
# The <UNIQUE[]URL> acts as a marker for quick and easy processing later

# Unique CSS url(LOCATION).
# $input[1] is the location; it is passed to absoluteURL() together with an
# empty string as second argument and wrapped in the marker.
function css_URL_unique($input) {
    $absolute = absoluteURL($input[1], '');

    return 'url(<UNIQUE[' . $absolute . ']URL>)';
}
# Unique CSS @import "URL".
# $input[1] is the URL; it is made absolute and wrapped in the marker.
function css_import_unique($input) {
    $absolute = absoluteURL($input[1]);

    return '@import "<UNIQUE[' . $absolute . ']URL>"';
}
# Unique CSS src=.
# $input: [1] the quote around the value (may be empty), [2] the URL, which
# is made absolute and wrapped in the marker.
function css_src_unique($input) {
    $quote = $input[1];
    $absolute = absoluteURL($input[2]);

    return 'src=' . $quote . '<UNIQUE[' . $absolute . ']URL>' . $quote;
}
/*****************************************************************
* Helper functions
******************************************************************/
# Look at the first non-whitespace char at or after $offset and compare it
# with $char (X). Whitespace is space, tab, CR and LF.
#   - Normal mode: returns the position of X, or false if another char
#     (or the end of the string) comes first.
#   - $inverse: the char must NOT be X; returns the position of the char
#     found, or false if it is X (or the end of the string is reached).
#   - $pastChar: after finding X, skip the whitespace that follows it and
#     return the position of the first char beyond that whitespace.
function str_checknext($input, $char, $offset, $inverse = false, $pastChar = false) {

    $total = strlen($input);

    for ( $pos = $offset; $pos < $total; ++$pos ) {

        $current = $input[$pos];

        # Whitespace never decides anything, keep looking
        if ( $current == ' ' || $current == "\t" || $current == "\r" || $current == "\n" ) {
            continue;
        }

        # Some other char than the one asked for
        if ( $current != $char ) {
            return $inverse ? $pos : false;
        }

        # This is the char asked for - which is a failure in inverse mode
        if ( $inverse ) {
            return false;
        }

        # Plain mode reports where the char is
        if ( ! $pastChar ) {
            return $pos;
        }

        # Otherwise step over the char and any whitespace behind it
        $after = $pos + 1;
        return $after + strspn($input, " \t\r\n", $after);
    }

    # Ran off the end of the string
    return false;
}
# Same as str_checknext() but going backwards: look at the first
# non-whitespace char at or before $offset and compare it with $char.
#   - Normal mode: returns the position of $char, or false if another char
#     comes first or the start of the string is passed.
#   - $inverse: returns the position of the char found if it is NOT $char,
#     false if it is $char, and true if only whitespace (or nothing) was
#     found before the start of the string.
# The scan includes index 0, so a position of 0 is a valid result: callers
# must compare the return value with === false.
function str_checkprev($input, $char, $offset, $inverse = false) {

    # Walk back to and including the first char of the string
    for ( $i = $offset; $i >= 0; --$i ) {

        # Examine char
        switch ( $input[$i] ) {

            # Ignore whitespace
            case ' ':
            case "\t":
            case "\r":
            case "\n":
                break;

            # Found char
            case $char:
                return $inverse ? false : $i;

            # Found non-$char char
            default:
                return $inverse ? $i : false;
        }
    }

    # Nothing but whitespace in front of $offset
    return $inverse;
}
# Analyze javascript and return offset positions.
# Default is to find the end of the statement, indicated by:
#   (1) ; while not in string
#   (2) newline which, if not there, would create invalid syntax
#   (3) a closing bracket (object, language construct or function call) for which
#       no corresponding opening bracket was detected AFTER the passed offset
#   (4) a comma that is not inside brackets opened after the passed offset
# If (int) $argPos is true, we return an array of the start and end position
# for the nth argument, where n = $argPos. The $start position must be just inside
# the parenthesis of the function call we're interested in.
# If no end is found, the length of $input is used as the end.
function analyze_js($input, $start, $argPos = false) {

    # Argument counter - always initialised so the comma handling below
    # never reads an undefined variable
    $currentArg = 1;

    # Loop through the input, stopping only at special chars
    for ( $i = $start, $length = strlen($input), $end = false, $openObjects = $openBrackets = $openArrays = 0;
            $end === false && $i < $length;
            ++$i ) {

        $char = $input[$i];

        switch ( $char ) {

            # Starting string delimiters
            case '"':
            case "'":

                # An escaped quote does not start a string. (Nothing can
                # precede the very first char, so it is never escaped.)
                if ( $i > 0 && $input[$i-1] == '\\' ) {
                    break;
                }

                # Skip straight to end of string
                # Find the corresponding end delimiter and ensure it's not escaped
                while ( ( $i = strpos($input, $char, $i+1) ) && $input[$i-1] == '\\' );

                # Check for false, in which case we assume the end is the end of the doc
                if ( $i === false ) {
                    break 2;
                }

                break;

            # End of operation?
            case ';':
                $end = $i;
                break;

            # New lines
            case "\n":
            case "\r":

                # Newlines are OK if occurring within open brackets, arrays or objects.
                if ( $openObjects || $openBrackets || $openArrays || $argPos ) {
                    break;
                }

                # Newlines are also OK if followed by an opening function OR concatenation
                # e.g. someFunc\n(params) or someVar \n + anotherVar
                # Find next non-whitespace char position
                $tmp = $i + strspn($input, " \t\r\n", $i+1);

                # And compare to allowed chars
                if ( isset($input[$tmp+1]) && ( $input[$tmp+1] == '(' || $input[$tmp+1] == '+' ) ) {
                    $i = $tmp;
                    break;
                }

                # Newline not indicated as OK, set the end to here
                $end = $i;

                break;

            # Concatenation
            case '+':
                # Our interest in the + operator is its use in allowing an expression
                # to span multiple lines. If we come across a +, move past all whitespace,
                # including newlines (which would otherwise indicate end of expression).
                $i += strspn($input, " \t\r\n", $i+1);
                break;

            # Opening chars (objects, parenthesis and arrays)
            case '{':
                ++$openObjects;
                break;
            case '(':
                ++$openBrackets;
                break;
            case '[':
                ++$openArrays;
                break;

            # Closing chars - is there a corresponding open char?
            # Yes = reduce stored count. No = end of statement.
            case '}':
                $openObjects ? --$openObjects : $end = $i;
                break;
            case ')':
                $openBrackets ? --$openBrackets : $end = $i;
                break;
            case ']':
                $openArrays ? --$openArrays : $end = $i;
                break;

            # Commas - tell us which argument it is
            case ',':

                # Ignore commas inside other functions or whatnot
                if ( $openObjects || $openBrackets || $openArrays ) {
                    break;
                }

                # End now: either we are not looking for an argument at all
                # (the comma then terminates the expression) or this comma
                # closes the argument we were asked for
                if ( ! $argPos || $currentArg == $argPos ) {
                    $end = $i;
                }

                # Increase the current argument number
                ++$currentArg;

                # If we're not after the first arg, start now?
                if ( $currentArg == $argPos ) {
                    $start = $i+1;
                }

                break;
        }
    }

    # End not found? Use end of document
    if ( $end === false ) {
        $end = $length;
    }

    # Return array of start/end
    if ( $argPos ) {
        return array($start, $end);
    }

    # Return end
    return $end;
}
# Find the end of the value of a javascript assignment.
# Scans $input from $start and returns the position of the first of:
#   (1) ; while not in string
#   (2) a closing bracket (object, parenthesis or array) for which no
#       corresponding opening bracket was detected AFTER the passed offset
#   (3) a comma that is not inside brackets opened after the passed offset
# Unlike analyze_js(), a newline never ends the value. If none of the above
# is found, the length of $input is returned.
function analyzeAssign_js($input, $start) {

    # Loop through the input, stopping only at special chars
    for ( $i = $start, $length = strlen($input), $end = false, $openObjects = $openBrackets = $openArrays = 0;
            $end === false && $i < $length;
            ++$i ) {

        $char = $input[$i];

        switch ( $char ) {

            # Starting string delimiters
            case '"':
            case "'":

                # An escaped quote does not start a string. (Nothing can
                # precede the very first char, so it is never escaped.)
                if ( $i > 0 && $input[$i-1] == '\\' ) {
                    break;
                }

                # Skip straight to end of string
                # Find the corresponding end delimiter and ensure it's not escaped
                while ( ( $i = strpos($input, $char, $i+1) ) && $input[$i-1] == '\\' );

                # Check for false, in which case we assume the end is the end of the doc
                if ( $i === false ) {
                    break 2;
                }

                break;

            # End of operation?
            case ';':
                $end = $i;
                break;

            # Concatenation
            case '+':
                # Move past all whitespace after the operator
                $i += strspn($input, " \t\r\n", $i+1);
                break;

            # Opening chars (objects, parenthesis and arrays)
            case '{':
                ++$openObjects;
                break;
            case '(':
                ++$openBrackets;
                break;
            case '[':
                ++$openArrays;
                break;

            # Closing chars - is there a corresponding open char?
            # Yes = reduce stored count. No = end of statement.
            case '}':
                $openObjects ? --$openObjects : $end = $i;
                break;
            case ')':
                $openBrackets ? --$openBrackets : $end = $i;
                break;
            case ']':
                $openArrays ? --$openArrays : $end = $i;
                break;

            # Commas end the value unless inside other functions or whatnot
            case ',':
                if ( $openObjects || $openBrackets || $openArrays ) {
                    break;
                }
                $end = $i;
                break;
        }
    }

    # End not found? Use end of document
    if ( $end === false ) {
        $end = $length;
    }

    # Return end
    return $end;
}
/*****************************************************************
* Page encoding functions
******************************************************************/
# Encode page - splits into HTML/script sections and encodes HTML.
# <script>...</script> blocks are passed through unencoded (wrapped in
# marker comments); the text between them is passed to encodeBlock().
# Everything up to and including the <body> tag is copied verbatim.
function encodePage($input) {

    # No scripts is easy - just encode the lot
    if ( ! preg_match_all('#<script.*?</script>#is', $input, $scripts, PREG_OFFSET_CAPTURE) ) {
        return encodeBlock($input);
    }

    # Create starting offset - only start encoding after the <body> tag
    # as this seems to help browsers cope!
    $offset = preg_match('#<body[^>]*>(.)#is', $input, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[1][1] : 0;
    $new = $offset ? substr($input, 0, $offset) : '';

    # Go through all the matches
    foreach ( $scripts[0] as $match ) {

        $scriptStart = $match[1];
        $scriptEnd = $scriptStart + strlen($match[0]);

        # Script lies completely inside what has been copied already (e.g. a
        # script in the <head>): adding it again would duplicate it and move
        # the offset backwards
        if ( $scriptEnd <= $offset ) {
            continue;
        }

        # The copied part ended inside this script: copy the rest of the
        # script verbatim so it stays in one piece
        if ( $scriptStart < $offset ) {
            $new .= substr($input, $offset, $scriptEnd - $offset);
            $offset = $scriptEnd;
            continue;
        }

        # Length of the non-script block in front of this script - right up
        # to the char before "<script"
        $length = $scriptStart - $offset;

        # Add encoded block to page if there is one
        if ( $length > 0 ) {
            $new .= "\n\n\n<!--start encode block-->\n";
            $new .= encodeBlock(substr($input, $offset, $length));
            $new .= "\n<!--end encode block-->\n\n\n";
        }

        # Add unencoded script to page
        $new .= "\n\n\n<!--start unencoded block-->\n";
        $new .= $match[0];
        $new .= "\n<!--end unencoded block-->\n\n\n";

        # Move offset up
        $offset = $scriptEnd;
    }

    # Add final block (anything left after the last script)
    $remainder = substr($input, $offset);
    if ( $remainder !== false && $remainder !== '' ) {
        $new .= encodeBlock($remainder);
    }

    # Return the encoded page
    return $new;
}
# Encode block - applies the actual encoding (or rather "escaping").
# If the global $charset is set it is lower-cased (in place) and, when the
# mbstring function is available, $input is converted from that charset to
# HTML entities. The text is then passed through arcfour('encrypt', ...)
# with the global unique salt and wrapped in a <script> that decodes it
# in the browser and document.write()s the result.
function encodeBlock($input) {

    global $charset;

    if ( isset($charset) ) {
        $charset = strtolower($charset);

        if ( function_exists('mb_convert_encoding') ) {
            $input = mb_convert_encoding($input, 'HTML-ENTITIES', $charset);
        }
    }

    $payload = arcfour('encrypt', $GLOBALS['unique_salt'], $input);

    # Return javascript decoder
    return '<script type="text/javascript">document.write(arcfour(ginf.enc.u,base64_decode(\'' . $payload . '\')));</script>';
}