source: sipes/cord/includes/unicode.inc

stableversion-3.0
Last change on this file was d7a822e, checked in by José Gregorio Puentes <jpuentes@…>, 8 años ago

se agrego el directorio del cord

  • Propiedad mode establecida a 100755
File size: 17.2 KB
Línea 
1<?php
2
/**
 * Status code: the check for PHP Unicode support found a problem.
 */
define('UNICODE_ERROR', -1);

/**
 * Status code: standard PHP (emulated) Unicode support is being used.
 */
define('UNICODE_SINGLEBYTE', 0);

/**
 * Status code: full Unicode support through the PHP mbstring extension is
 * being used.
 */
define('UNICODE_MULTIBYTE', 1);
18
/**
 * Runs _unicode_check() and records the detected Unicode mode globally.
 *
 * Only the status code (first element of the result) is kept, in
 * $GLOBALS['multibyte']; the accompanying message is discarded.
 */
function unicode_check() {
  $result = _unicode_check();
  $GLOBALS['multibyte'] = $result[0];
}
25
/**
 * Perform checks about Unicode support in PHP, and set the right settings if
 * needed.
 *
 * Because Drupal needs to be able to handle text in various encodings, we do
 * not support mbstring function overloading. HTTP input/output conversion must
 * be disabled for similar reasons.
 *
 * Side effects: LC_CTYPE is always switched to the 'C' locale, and when the
 * mbstring extension passes all checks its internal encoding is set to UTF-8
 * and its language to 'uni'.
 *
 * @return
 *   An array with two elements: one of the UNICODE_ERROR, UNICODE_SINGLEBYTE
 *   or UNICODE_MULTIBYTE status codes, followed by a translated description of
 *   the problem (an empty string when full multibyte support is available).
 */
function _unicode_check() {
  // Ensure translations don't break at install time
  $t = get_t();

  // Set the standard C locale to ensure consistent, ASCII-only string handling.
  setlocale(LC_CTYPE, 'C');

  // Check for outdated PCRE library
  // Note: we check if U+E2 is in the range U+E0 - U+E1. This test returns TRUE on old PCRE versions.
  if (preg_match('/[à-á]/u', 'â')) {
    return array(UNICODE_ERROR, $t('The PCRE library in your PHP installation is outdated. This will cause problems when handling Unicode text. If you are running PHP 4.3.3 or higher, make sure you are using the PCRE library supplied by PHP. Please refer to the <a href="@url">PHP PCRE documentation</a> for more information.', array('@url' => 'http://www.php.net/pcre')));
  }

  // Check for mbstring extension
  if (!function_exists('mb_strlen')) {
    return array(UNICODE_SINGLEBYTE, $t('Operations on Unicode strings are emulated on a best-effort basis. Install the <a href="@url">PHP mbstring extension</a> for improved Unicode support.', array('@url' => 'http://www.php.net/mbstring')));
  }

  // Check mbstring configuration
  if (ini_get('mbstring.func_overload') != 0) {
    return array(UNICODE_ERROR, $t('Multibyte string function overloading in PHP is active and must be disabled. Check the php.ini <em>mbstring.func_overload</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
  }
  if (ini_get('mbstring.encoding_translation') != 0) {
    return array(UNICODE_ERROR, $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.encoding_translation</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
  }
  // mbstring.http_input and mbstring.http_output are deprecated and empty by
  // default as of PHP 5.6, so comparing them against 'pass' would flag every
  // modern installation as broken. Only enforce the setting on older PHP.
  if (version_compare(PHP_VERSION, '5.6.0', '<')) {
    if (ini_get('mbstring.http_input') != 'pass') {
      return array(UNICODE_ERROR, $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.http_input</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
    }
    if (ini_get('mbstring.http_output') != 'pass') {
      return array(UNICODE_ERROR, $t('Multibyte string output conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.http_output</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
    }
  }

  // Set appropriate configuration
  mb_internal_encoding('utf-8');
  mb_language('uni');
  return array(UNICODE_MULTIBYTE, '');
}
74
/**
 * Builds the status-report entry describing the Unicode library in use.
 *
 * @return
 *   An array with a single 'unicode' key whose value holds the 'title',
 *   'value', optional 'description' and 'severity' of the requirement, based
 *   on the result of _unicode_check().
 */
function unicode_requirements() {
  // Ensure translations don't break at install time
  $t = get_t();

  // Human-readable name for each possible check result.
  $libraries = array(
    UNICODE_SINGLEBYTE => $t('Standard PHP'),
    UNICODE_MULTIBYTE => $t('PHP Mbstring Extension'),
    UNICODE_ERROR => $t('Error'),
  );
  // Requirement severity for each possible check result.
  $severities = array(
    UNICODE_SINGLEBYTE => REQUIREMENT_WARNING,
    UNICODE_MULTIBYTE => REQUIREMENT_OK,
    UNICODE_ERROR => REQUIREMENT_ERROR,
  );

  $status = _unicode_check();
  $library = $status[0];
  $description = $status[1];

  $requirement = array(
    'title' => $t('Unicode library'),
    'value' => $libraries[$library],
  );
  // The description is only present when the check produced a message.
  if ($description) {
    $requirement['description'] = $description;
  }
  $requirement['severity'] = $severities[$library];

  return array('unicode' => $requirement);
}
106
/**
 * Prepare a new XML parser.
 *
 * This is a wrapper around xml_parser_create() which extracts the encoding from
 * the XML data first and sets the output encoding to UTF-8. This function should
 * be used instead of xml_parser_create(), because PHP 4's XML parser doesn't
 * check the input encoding itself. "Starting from PHP 5, the input encoding is
 * automatically detected, so that the encoding parameter specifies only the
 * output encoding."
 *
 * This is also where unsupported encodings will be converted. Callers should
 * take this into account: $data might have been changed after the call (a
 * leading UTF-8 byte order mark is stripped, and data in an encoding the
 * parser does not support is converted to UTF-8 with its prolog rewritten).
 *
 * @param &$data
 *   The XML data which will be parsed later.
 * @return
 *   An XML parser object, or 0 if the data is in an unsupported encoding that
 *   could not be converted to UTF-8.
 */
function drupal_xml_parser_create(&$data) {
  // Default XML encoding is UTF-8
  $encoding = 'utf-8';
  $bom = FALSE;

  // Check for UTF-8 byte order mark (PHP5's XML parser doesn't handle it).
  if (!strncmp($data, "\xEF\xBB\xBF", 3)) {
    $bom = TRUE;
    $data = substr($data, 3);
  }

  // Check for an encoding declaration in the XML prolog if no BOM was found.
  // PCRE is used because the POSIX ereg*() functions were removed in PHP 7.
  if (!$bom && preg_match('/^<\?xml[^>]+encoding="([^"]+)"/', $data, $match)) {
    $encoding = $match[1];
  }

  // Unsupported encodings are converted here into UTF-8.
  $php_supported = array('utf-8', 'iso-8859-1', 'us-ascii');
  if (!in_array(strtolower($encoding), $php_supported)) {
    $out = drupal_convert_to_utf8($data, $encoding);
    if ($out !== FALSE) {
      $encoding = 'utf-8';
      // Rewrite the prolog so that it matches the converted payload.
      $data = preg_replace('/^(<\?xml[^>]+encoding)="([^"]+)"/', '\\1="utf-8"', $out);
    }
    else {
      watchdog('php', 'Could not convert XML encoding %s to UTF-8.', array('%s' => $encoding), WATCHDOG_WARNING);
      return 0;
    }
  }

  $xml_parser = xml_parser_create($encoding);
  xml_parser_set_option($xml_parser, XML_OPTION_TARGET_ENCODING, 'utf-8');
  return $xml_parser;
}
159
/**
 * Convert data to UTF-8
 *
 * Uses the first available of the iconv, mbstring or GNU recode PHP
 * extensions, in that order. If none is installed, an error is logged.
 *
 * @param $data
 *   The data to be converted.
 * @param $encoding
 *   The encoding that the data is in
 * @return
 *   Converted data or FALSE.
 */
function drupal_convert_to_utf8($data, $encoding) {
  if (function_exists('iconv')) {
    return @iconv($encoding, 'utf-8', $data);
  }

  if (function_exists('mb_convert_encoding')) {
    return @mb_convert_encoding($data, 'utf-8', $encoding);
  }

  if (function_exists('recode_string')) {
    return @recode_string($encoding .'..utf-8', $data);
  }

  // No conversion library is available at all.
  watchdog('php', 'Unsupported encoding %s. Please install iconv, GNU recode or mbstring for PHP.', array('%s' => $encoding), WATCHDOG_ERROR);
  return FALSE;
}
189
/**
 * Truncate a UTF-8-encoded string safely to a number of bytes.
 *
 * When the cut position falls inside a multi-byte UTF-8 sequence, the cut is
 * moved backwards to the start of that sequence so that no partial character
 * is returned.
 *
 * Use this function whenever you want to chop off a string at an unsure
 * location. If you already know you are splitting on a character boundary
 * (e.g. after using strpos() or similar), plain substr() is sufficient.
 *
 * @param $string
 *   The string to truncate.
 * @param $len
 *   An upper limit on the returned string length, in bytes.
 * @return
 *   The truncated string.
 */
function drupal_truncate_bytes($string, $len) {
  // Short enough already: nothing to do.
  if (strlen($string) <= $len) {
    return $string;
  }

  // The byte right after the cut starts a new character (ASCII or a lead
  // byte), so the cut is already on a character boundary.
  $next = ord($string[$len]);
  if ($next < 0x80 || $next >= 0xC0) {
    return substr($string, 0, $len);
  }

  // Otherwise step back over continuation bytes (0x80-0xBF) until the lead
  // byte of the split sequence is reached, and cut just before it.
  do {
    $len--;
    if ($len < 0) {
      break;
    }
    $byte = ord($string[$len]);
  } while ($byte >= 0x80 && $byte < 0xC0);

  return substr($string, 0, $len);
}
218
/**
 * Truncate a UTF-8-encoded string safely to a number of characters.
 *
 * @param $string
 *   The string to truncate.
 * @param $len
 *   An upper limit on the returned string length. When $dots is set, the
 *   four characters of the appended ' ...' count towards this limit.
 * @param $wordsafe
 *   Flag to truncate at last space within the upper limit. Defaults to FALSE.
 * @param $dots
 *   Flag to add trailing dots. Defaults to FALSE.
 * @return
 *   The truncated string.
 */
function truncate_utf8($string, $len, $wordsafe = FALSE, $dots = FALSE) {
  // Nothing to cut.
  if (drupal_strlen($string) <= $len) {
    return $string;
  }

  // Reserve room for the ' ...' suffix.
  if ($dots) {
    $len -= 4;
  }

  if ($wordsafe) {
    // Keep one extra character so that a space directly after the limit still
    // counts as a word boundary.
    $candidate = drupal_substr($string, 0, $len + 1);
    $last_space = strrpos($candidate, ' ');
    // A space at position 0 (or no space at all) means there is no usable word
    // boundary, so fall back to a hard cut.
    if ($last_space) {
      $result = substr($candidate, 0, $last_space);
    }
    else {
      $result = drupal_substr($candidate, 0, $len);
    }
  }
  else {
    $result = drupal_substr($string, 0, $len);
  }

  return $dots ? $result . ' ...' : $result;
}
262
/**
 * Encodes MIME/HTTP header values that contain non-ASCII, UTF-8 encoded
 * characters.
 *
 * For example, mime_header_encode('tést.txt') returns "=?UTF-8?B?dMOpc3QudHh0?=".
 *
 * See http://www.rfc-editor.org/rfc/rfc2047.txt for more information.
 *
 * Notes:
 * - Only encode strings that contain non-ASCII characters.
 * - We progressively cut-off a chunk with drupal_truncate_bytes(). This is to
 *   ensure each chunk starts and ends on a character boundary.
 * - If the input is malformed such that no character boundary exists within a
 *   chunk, the chunk is cut at the raw byte limit instead so that encoding
 *   always terminates.
 * - Using \n as the chunk separator may cause problems on some systems and may
 *   have to be changed to \r\n or \r.
 *
 * @param $string
 *   The header value to encode.
 * @return
 *   The encoded header value, or $string unchanged if it only contains
 *   printable ASCII characters.
 */
function mime_header_encode($string) {
  if (preg_match('/[^\x20-\x7E]/', $string)) {
    $chunk_size = 47; // floor((75 - strlen("=?UTF-8?B??=")) * 0.75);
    $len = strlen($string);
    $output = '';
    while ($len > 0) {
      $chunk = drupal_truncate_bytes($string, $chunk_size);
      $c = strlen($chunk);
      if ($c == 0) {
        // drupal_truncate_bytes() found no character boundary (invalid UTF-8:
        // a lead byte followed by a run of continuation bytes). Without this
        // fallback the loop would never consume any input and spin forever.
        $chunk = substr($string, 0, $chunk_size);
        $c = strlen($chunk);
      }
      $output .= ' =?UTF-8?B?'. base64_encode($chunk) ."?=\n";
      $string = substr($string, $c);
      $len -= $c;
    }
    return trim($output);
  }
  return $string;
}
294
/**
 * Complement to mime_header_encode
 *
 * Decodes the RFC 2047 encoded chunks found in a header value; each chunk is
 * handed to _mime_header_decode().
 *
 * @param $header
 *   The header value to decode.
 * @return
 *   The header with its encoded chunks replaced by their decoded content.
 */
function mime_header_decode($header) {
  // Pass 1: an encoded chunk that is followed by another encoded chunk. The
  // whitespace between the two is part of the match and is therefore dropped.
  $chained_chunk = '/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=\s+(?==\?)/';
  $collapsed = preg_replace_callback($chained_chunk, '_mime_header_decode', $header);

  // Pass 2: all remaining chunks; surrounding whitespace is left untouched.
  $single_chunk = '/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=/';
  return preg_replace_callback($single_chunk, '_mime_header_decode', $collapsed);
}
304
/**
 * Helper function to mime_header_decode
 *
 * @param $matches
 *   Regexp match array for one encoded chunk:
 *   - 1: Character set name
 *   - 2: Escaping method (Q or B)
 *   - 3: Encoded data
 * @return
 *   The decoded data, converted to UTF-8 when the chunk declares another
 *   character set.
 */
function _mime_header_decode($matches) {
  $charset = $matches[1];
  $method = $matches[2];
  $payload = $matches[3];

  if ($method == 'B') {
    // Base64 chunk.
    $data = base64_decode($payload);
  }
  else {
    // Quoted-printable chunk, in which underscores stand for spaces.
    $data = str_replace('_', ' ', quoted_printable_decode($payload));
  }

  if (strtolower($charset) != 'utf-8') {
    $data = drupal_convert_to_utf8($data, $charset);
  }
  return $data;
}
319
/**
 * Decodes all HTML entities (including numerical ones) to regular UTF-8 bytes.
 *
 * Double-escaped entities will only be decoded once ("&amp;lt;" becomes "&lt;",
 * not "<"). Be careful when using this function, as decode_entities can revert
 * previous sanitization efforts (&lt;script&gt; will become <script>).
 *
 * @param $text
 *   The text to decode entities in.
 * @param $exclude
 *   An array of characters which should not be decoded. For example,
 *   array('<', '&', '"'). This affects both named and numerical entities.
 *
 * @return
 *   The input $text, with all HTML entities decoded once.
 */
function decode_entities($text, $exclude = array()) {
  static $html_entities;
  if (!isset($html_entities)) {
    include_once './includes/unicode.entities.inc';
  }

  // Flip the exclude list so that we can do quick lookups later.
  $exclude = array_flip($exclude);

  // Select all entities in one pass, to avoid decoding double-escaped entities
  // twice. The matches are located up front and the result is stitched
  // together by hand: the PREG_REPLACE_EVAL modifier 'e' that used to drive
  // this replacement evaluated code built from the input and was removed in
  // PHP 7.
  if (!preg_match_all('/&(#x?)?([A-Za-z0-9]+);/', $text, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)) {
    // No entities at all (or the match failed): nothing to decode.
    return $text;
  }

  $output = '';
  $position = 0;
  foreach ($matches as $match) {
    // With PREG_OFFSET_CAPTURE every group is array(matched text, byte offset);
    // an unmatched optional prefix is reported as an empty string.
    $original = $match[0][0];
    $offset = $match[0][1];
    $prefix = $match[1][0];
    $codepoint = $match[2][0];

    // Copy the literal text between the previous entity and this one.
    $output .= substr($text, $position, $offset - $position);
    $output .= _decode_entities($prefix, $codepoint, $original, $html_entities, $exclude);
    $position = $offset + strlen($original);
  }

  // Append whatever follows the last entity.
  if ($position < strlen($text)) {
    $output .= substr($text, $position);
  }
  return $output;
}
351
/**
 * Helper function for decode_entities
 *
 * @param $prefix
 *   '' for a named entity, '#' for a decimal or '#x' for a hexadecimal
 *   numerical entity.
 * @param $codepoint
 *   The entity name, or the digits of a numerical entity.
 * @param $original
 *   The complete entity as found in the text, e.g. '&amp;' or '&#233;'.
 * @param &$html_entities
 *   Map of named entities (keyed by the complete entity) to their characters.
 * @param &$exclude
 *   Characters that must not be decoded, as array keys.
 * @return
 *   The decoded character as UTF-8 bytes, or $original when the entity is
 *   unknown, malformed, excluded, or does not denote a valid Unicode
 *   character.
 */
function _decode_entities($prefix, $codepoint, $original, &$html_entities, &$exclude) {
  // Named entity
  if (!$prefix) {
    // A named entity not in the exclude list.
    if (isset($html_entities[$original]) && !isset($exclude[$html_entities[$original]])) {
      return $html_entities[$original];
    }
    return $original;
  }

  // Numerical entity: validate the digits first. The caller's pattern also
  // lets through things like "&#abc;", which must be left alone rather than
  // fed to the arithmetic below.
  if ($prefix == '#x') {
    if (!preg_match('/^[0-9A-Fa-f]+$/D', $codepoint)) {
      return $original;
    }
    $codepoint = hexdec($codepoint);
  }
  else {
    if (!preg_match('/^[0-9]+$/D', $codepoint)) {
      return $original;
    }
    // Going through float keeps absurdly long digit strings comparable.
    $codepoint = (float) $codepoint;
  }

  // Values beyond U+10FFFF and UTF-16 surrogates are not valid characters and
  // cannot be represented in well-formed UTF-8.
  if ($codepoint > 0x10FFFF || ($codepoint >= 0xD800 && $codepoint <= 0xDFFF)) {
    return $original;
  }
  $codepoint = (int) $codepoint;

  // Encode codepoint as UTF-8 bytes
  if ($codepoint < 0x80) {
    $str = chr($codepoint);
  }
  else if ($codepoint < 0x800) {
    $str = chr(0xC0 | ($codepoint >> 6))
         . chr(0x80 | ($codepoint & 0x3F));
  }
  else if ($codepoint < 0x10000) {
    $str = chr(0xE0 | ( $codepoint >> 12))
         . chr(0x80 | (($codepoint >> 6) & 0x3F))
         . chr(0x80 | ( $codepoint       & 0x3F));
  }
  else {
    $str = chr(0xF0 | ( $codepoint >> 18))
         . chr(0x80 | (($codepoint >> 12) & 0x3F))
         . chr(0x80 | (($codepoint >> 6)  & 0x3F))
         . chr(0x80 | ( $codepoint        & 0x3F));
  }

  // Check for excluded characters
  if (isset($exclude[$str])) {
    return $original;
  }
  return $str;
}
401
/**
 * Count the amount of characters in a UTF-8 string. This is less than or
 * equal to the byte count.
 *
 * @param $text
 *   The string to measure.
 * @return
 *   The number of characters in $text.
 */
function drupal_strlen($text) {
  global $multibyte;

  if ($multibyte != UNICODE_MULTIBYTE) {
    // Emulation: drop every UTF-8 continuation byte, so that exactly one byte
    // per character remains to be counted.
    $one_byte_per_char = preg_replace("/[\x80-\xBF]/", '', $text);
    return strlen($one_byte_per_char);
  }

  return mb_strlen($text);
}
416
/**
 * Uppercase a UTF-8 string.
 *
 * Without the mbstring extension only ASCII letters and the accented
 * Latin-1 letters are converted.
 *
 * @param $text
 *   The string to convert.
 * @return
 *   The uppercased string.
 */
function drupal_strtoupper($text) {
  global $multibyte;

  if ($multibyte == UNICODE_MULTIBYTE) {
    return mb_strtoupper($text);
  }

  // Use C-locale for ASCII-only uppercase
  $upper = strtoupper($text);
  // Case flip Latin-1 accented letters
  return preg_replace_callback('/\xC3[\xA0-\xB6\xB8-\xBE]/', '_unicode_caseflip', $upper);
}
433
/**
 * Lowercase a UTF-8 string.
 *
 * Without the mbstring extension only ASCII letters and the accented
 * Latin-1 letters are converted.
 *
 * @param $text
 *   The string to convert.
 * @return
 *   The lowercased string.
 */
function drupal_strtolower($text) {
  global $multibyte;

  if ($multibyte == UNICODE_MULTIBYTE) {
    return mb_strtolower($text);
  }

  // Use C-locale for ASCII-only lowercase
  $lower = strtolower($text);
  // Case flip Latin-1 accented letters
  return preg_replace_callback('/\xC3[\x80-\x96\x98-\x9E]/', '_unicode_caseflip', $lower);
}
450
/**
 * Helper function for case conversion of Latin-1.
 * Used for flipping U+C0-U+DE to U+E0-U+FD and back.
 *
 * @param $matches
 *   Regexp match array whose first element is a two-byte UTF-8 sequence.
 * @return
 *   The same sequence with bit 0x20 of its second byte toggled.
 */
function _unicode_caseflip($matches) {
  $sequence = $matches[0];
  $flipped = ord($sequence[1]) ^ 32;
  return $sequence[0] . chr($flipped);
}
458
/**
 * Capitalize the first letter of a UTF-8 string.
 *
 * @param $text
 *   The string to convert.
 * @return
 *   $text with its first character uppercased by drupal_strtoupper().
 */
function drupal_ucfirst($text) {
  // Note: no mbstring equivalent!
  $head = drupal_strtoupper(drupal_substr($text, 0, 1));
  $tail = drupal_substr($text, 1);
  return $head . $tail;
}
466
/**
 * Cut off a piece of a string based on character indices and counts. Follows
 * the same behavior as PHP's own substr() function.
 *
 * Note that for cutting off a string at a known character/substring
 * location, the usage of PHP's normal strpos/substr is safe and
 * much faster.
 *
 * @param $text
 *   The UTF-8 string to cut.
 * @param $start
 *   Character index to start at; a negative value counts from the end.
 * @param $length
 *   Number of characters to return; a negative value leaves that many
 *   characters off the end, NULL (the default) returns everything up to the
 *   end of the string.
 * @return
 *   The requested part of $text.
 */
function drupal_substr($text, $start, $length = NULL) {
  global $multibyte;
  if ($multibyte == UNICODE_MULTIBYTE) {
    // Never hand a NULL length to mb_substr(): older PHP treats it as 0.
    return $length === NULL ? mb_substr($text, $start) : mb_substr($text, $start, $length);
  }

  // Emulation: walk the bytes and count only those that start a character,
  // i.e. everything except UTF-8 continuation bytes (0x80-0xBF).
  $strlen = strlen($text);

  // Find the starting byte offset
  $bytes = 0;
  if ($start > 0) {
    // Count all the continuation bytes from the start until we have found
    // $start characters
    $bytes = -1; $chars = -1;
    while ($bytes < $strlen && $chars < $start) {
      $bytes++;
      if ($bytes >= $strlen) {
        // Ran off the end of the string: the start offset is the string
        // length. Stop here instead of reading a non-existent byte.
        break;
      }
      $c = ord($text[$bytes]);
      if ($c < 0x80 || $c >= 0xC0) {
        $chars++;
      }
    }
  }
  else if ($start < 0) {
    // Count all the continuation bytes from the end until we have found
    // abs($start) characters
    $start = abs($start);
    $bytes = $strlen; $chars = 0;
    while ($bytes > 0 && $chars < $start) {
      $bytes--;
      $c = ord($text[$bytes]);
      if ($c < 0x80 || $c >= 0xC0) {
        $chars++;
      }
    }
  }
  $istart = $bytes;

  // Find the ending byte offset
  if ($length === NULL) {
    $bytes = $strlen - 1;
  }
  else if ($length > 0) {
    // Count all the continuation bytes from the starting index until we have
    // found $length + 1 characters. Then backtrack one byte.
    $bytes = $istart; $chars = 0;
    while ($bytes < $strlen && $chars < $length) {
      $bytes++;
      if ($bytes >= $strlen) {
        // The end of the string acts as the start of the "next character";
        // no byte exists there, so do not try to read it.
        break;
      }
      $c = ord($text[$bytes]);
      if ($c < 0x80 || $c >= 0xC0) {
        $chars++;
      }
    }
    $bytes--;
  }
  else if ($length < 0) {
    // Count all the continuation bytes from the end until we have found
    // abs($length) characters
    $length = abs($length);
    $bytes = $strlen - 1; $chars = 0;
    while ($bytes >= 0 && $chars < $length) {
      $c = ord($text[$bytes]);
      if ($c < 0x80 || $c >= 0xC0) {
        $chars++;
      }
      $bytes--;
    }
  }
  else {
    // A length of zero selects nothing, exactly like substr()/mb_substr().
    return '';
  }
  $iend = $bytes;

  return substr($text, $istart, max(0, $iend - $istart + 1));
}
546
547
Nota: Vea TracBrowser para ayuda de uso del navegador del repositorio.