Navegación de contexto

unicode.inc

stableversion-3.0

Last change on this file was d7a822e, checked in by José Gregorio Puentes <jpuentes@…>, 8 años ago
se agrego el directorio del cord
Propiedad mode establecida a `100755`
File size: 17.2 KB

Línea
1	<?php
2
/**
 * Status code: the check for PHP Unicode support failed.
 */
define('UNICODE_ERROR', -1);

/**
 * Status code: plain PHP string functions are in use and Unicode handling is
 * emulated.
 */
define('UNICODE_SINGLEBYTE', 0);

/**
 * Status code: the PHP mbstring extension is in use and provides full Unicode
 * support.
 */
define('UNICODE_MULTIBYTE', 1);
18
/**
 * Runs _unicode_check() and remembers the detected status.
 *
 * Only the first element of the result (the UNICODE_* status code) is kept; it
 * is stored in the global $multibyte variable. The accompanying message is
 * discarded.
 */
function unicode_check() {
  $result = _unicode_check();
  $GLOBALS['multibyte'] = $result[0];
}
25
/**
 * Perform checks about Unicode support in PHP, and set the right settings if
 * needed.
 *
 * Because Drupal needs to be able to handle text in various encodings, we do
 * not support mbstring function overloading. HTTP input/output conversion must
 * be disabled for similar reasons.
 *
 * As a side effect the LC_CTYPE locale is switched to 'C' and, when mbstring
 * passes all checks, its internal encoding is set to UTF-8.
 *
 * @return
 *   An array with two elements: one of the UNICODE_ERROR, UNICODE_SINGLEBYTE
 *   or UNICODE_MULTIBYTE status constants, followed by a translated message
 *   describing the situation (an empty string when mbstring is fully usable).
 */
function _unicode_check() {
  // Ensure translations don't break at install time.
  $t = get_t();

  // Set the standard C locale to ensure consistent, ASCII-only string handling.
  setlocale(LC_CTYPE, 'C');

  // Check for outdated PCRE library.
  // Note: we check if U+E2 is in the range U+E0 - U+E1. This test returns TRUE
  // on old PCRE versions. The characters are spelled as explicit UTF-8 byte
  // escapes so the test cannot be corrupted when this file is re-encoded (a
  // mis-encoded literal makes the test match unconditionally).
  if (preg_match("/[\xC3\xA0-\xC3\xA1]/u", "\xC3\xA2")) {
    return array(UNICODE_ERROR, $t('The PCRE library in your PHP installation is outdated. This will cause problems when handling Unicode text. If you are running PHP 4.3.3 or higher, make sure you are using the PCRE library supplied by PHP. Please refer to the <a href="@url">PHP PCRE documentation</a> for more information.', array('@url' => 'http://www.php.net/pcre')));
  }

  // Check for mbstring extension.
  if (!function_exists('mb_strlen')) {
    return array(UNICODE_SINGLEBYTE, $t('Operations on Unicode strings are emulated on a best-effort basis. Install the <a href="@url">PHP mbstring extension</a> for improved Unicode support.', array('@url' => 'http://www.php.net/mbstring')));
  }

  // Check mbstring configuration.
  if (ini_get('mbstring.func_overload') != 0) {
    return array(UNICODE_ERROR, $t('Multibyte string function overloading in PHP is active and must be disabled. Check the php.ini <em>mbstring.func_overload</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
  }
  if (ini_get('mbstring.encoding_translation') != 0) {
    return array(UNICODE_ERROR, $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.encoding_translation</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
  }
  if (ini_get('mbstring.http_input') != 'pass') {
    return array(UNICODE_ERROR, $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.http_input</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
  }
  if (ini_get('mbstring.http_output') != 'pass') {
    return array(UNICODE_ERROR, $t('Multibyte string output conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.http_output</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
  }

  // Set appropriate configuration.
  mb_internal_encoding('utf-8');
  mb_language('uni');
  return array(UNICODE_MULTIBYTE, '');
}
74
/**
 * Builds the 'unicode' requirements entry from the result of _unicode_check().
 *
 * @return
 *   An array with a single 'unicode' key holding 'title', 'value', an optional
 *   'description' (only when the check produced a message) and 'severity'.
 */
function unicode_requirements() {
  // get_t() is used so that translation also works at install time.
  $t = get_t();

  // Human-readable label for each status code.
  $labels = array();
  $labels[UNICODE_SINGLEBYTE] = $t('Standard PHP');
  $labels[UNICODE_MULTIBYTE] = $t('PHP Mbstring Extension');
  $labels[UNICODE_ERROR] = $t('Error');

  // Requirement severity for each status code.
  $levels = array();
  $levels[UNICODE_SINGLEBYTE] = REQUIREMENT_WARNING;
  $levels[UNICODE_MULTIBYTE] = REQUIREMENT_OK;
  $levels[UNICODE_ERROR] = REQUIREMENT_ERROR;

  $check = _unicode_check();
  $status = $check[0];
  $message = $check[1];

  $report = array();
  $report['title'] = $t('Unicode library');
  $report['value'] = $labels[$status];
  if ($message) {
    $report['description'] = $message;
  }
  $report['severity'] = $levels[$status];

  return array('unicode' => $report);
}
106
/**
 * Prepare a new XML parser.
 *
 * This is a wrapper around xml_parser_create() which extracts the encoding from
 * the XML data first and sets the output encoding to UTF-8. This function should
 * be used instead of xml_parser_create(), because PHP 4's XML parser doesn't
 * check the input encoding itself. "Starting from PHP 5, the input encoding is
 * automatically detected, so that the encoding parameter specifies only the
 * output encoding."
 *
 * This is also where unsupported encodings will be converted. Callers should
 * take this into account: $data might have been changed after the call (a
 * leading UTF-8 byte order mark is stripped, and data in an encoding the
 * parser does not support is converted to UTF-8 with its prolog rewritten).
 *
 * @param &$data
 *   The XML data which will be parsed later.
 * @return
 *   An XML parser, or 0 if the data is in an encoding that could not be
 *   converted to UTF-8.
 */
function drupal_xml_parser_create(&$data) {
  // Default XML encoding is UTF-8.
  $encoding = 'utf-8';
  $bom = FALSE;

  // Check for UTF-8 byte order mark (PHP5's XML parser doesn't handle it).
  if (!strncmp($data, "\xEF\xBB\xBF", 3)) {
    $bom = TRUE;
    $data = substr($data, 3);
  }

  // Check for an encoding declaration in the XML prolog if no BOM was found.
  // PCRE is used because the POSIX ereg*() functions no longer exist in
  // current PHP versions.
  if (!$bom && preg_match('/^<\?xml[^>]+encoding="([^"]+)"/', $data, $match)) {
    $encoding = $match[1];
  }

  // Unsupported encodings are converted here into UTF-8.
  $php_supported = array('utf-8', 'iso-8859-1', 'us-ascii');
  if (!in_array(strtolower($encoding), $php_supported)) {
    $out = drupal_convert_to_utf8($data, $encoding);
    if ($out !== FALSE) {
      $encoding = 'utf-8';
      // Make the prolog agree with the converted data.
      $data = preg_replace('/^(<\?xml[^>]+encoding)="([^"]+)"/', '\\1="utf-8"', $out);
    }
    else {
      watchdog('php', 'Could not convert XML encoding %s to UTF-8.', array('%s' => $encoding), WATCHDOG_WARNING);
      return 0;
    }
  }

  $xml_parser = xml_parser_create($encoding);
  xml_parser_set_option($xml_parser, XML_OPTION_TARGET_ENCODING, 'utf-8');
  return $xml_parser;
}
159
/**
 * Convert data to UTF-8.
 *
 * The first available converter is used, tried in this order: iconv, mbstring,
 * GNU recode. If none of them is installed an error is logged.
 *
 * @param $data
 *   The data to be converted.
 * @param $encoding
 *   The encoding that the data is in.
 * @return
 *   Whatever the selected converter returned, or FALSE when no converter is
 *   available.
 */
function drupal_convert_to_utf8($data, $encoding) {
  if (function_exists('iconv')) {
    return @iconv($encoding, 'utf-8', $data);
  }

  if (function_exists('mb_convert_encoding')) {
    return @mb_convert_encoding($data, 'utf-8', $encoding);
  }

  if (function_exists('recode_string')) {
    return @recode_string($encoding .'..utf-8', $data);
  }

  // No conversion library is installed.
  watchdog('php', 'Unsupported encoding %s. Please install iconv, GNU recode or mbstring for PHP.', array('%s' => $encoding), WATCHDOG_ERROR);
  return FALSE;
}
189
/**
 * Truncate a UTF-8-encoded string safely to a number of bytes.
 *
 * If the end position is in the middle of a UTF-8 sequence, it scans backwards
 * until the beginning of the byte sequence.
 *
 * Use this function whenever you want to chop off a string at an unsure
 * location. On the other hand, if you're sure that you're splitting on a
 * character boundary (e.g. after using strpos() or similar), you can safely use
 * substr() instead.
 *
 * The result is never longer than $len bytes. For malformed input in which
 * every byte up to the limit is a continuation byte there is no sequence start
 * to fall back to; in that case the string is simply cut at $len bytes.
 *
 * @param $string
 *   The string to truncate.
 * @param $len
 *   An upper limit on the returned string length, in bytes. Zero or a
 *   negative value yields an empty string.
 * @return
 *   The truncated string.
 */
function drupal_truncate_bytes($string, $len) {
  if (strlen($string) <= $len) {
    return $string;
  }
  if ($len <= 0) {
    return '';
  }

  // Starting at the byte just past the limit, walk backwards while we are
  // looking at a continuation byte (10xxxxxx). The first byte that is not a
  // continuation byte starts a character, so cutting right before it is safe.
  $cut = $len;
  while ($cut >= 0 && (ord($string[$cut]) & 0xC0) == 0x80) {
    $cut--;
  }

  if ($cut < 0) {
    // Nothing but continuation bytes: no character boundary exists. Honour
    // the byte limit rather than letting substr() see a negative length.
    return substr($string, 0, $len);
  }
  return substr($string, 0, $cut);
}
218
/**
 * Truncate a UTF-8-encoded string safely to a number of characters.
 *
 * @param $string
 *   The string to truncate.
 * @param $len
 *   An upper limit on the returned string length, in characters.
 * @param $wordsafe
 *   Flag to truncate at last space within the upper limit. Defaults to FALSE.
 * @param $dots
 *   Flag to add trailing dots. Defaults to FALSE. The four characters of the
 *   ' ...' suffix are taken out of $len; when $len is smaller than that, only
 *   the suffix is returned.
 * @return
 *   The truncated string.
 */
function truncate_utf8($string, $len, $wordsafe = FALSE, $dots = FALSE) {

  if (drupal_strlen($string) <= $len) {
    return $string;
  }

  if ($dots) {
    // Reserve room for the ' ...' suffix appended below.
    $len -= 4;
  }

  if ($len <= 0) {
    // No room left for any text. A negative length must not be handed to
    // drupal_substr(), where it would mean "trim from the end" and return
    // nearly the whole string.
    $string = '';
  }
  else if ($wordsafe) {
    // Leave one more character, so that a space directly after the limit
    // still counts as a word boundary.
    $string = drupal_substr($string, 0, $len + 1);
    $last_space = strrpos($string, ' ');
    if ($last_space !== FALSE && $last_space > 0) {
      // A space exists AND is not on position 0. strrpos() yields a byte
      // offset on a character boundary, so plain substr() is safe here.
      $string = substr($string, 0, $last_space);
    }
    else {
      $string = drupal_substr($string, 0, $len);
    }
  }
  else {
    $string = drupal_substr($string, 0, $len);
  }

  if ($dots) {
    $string .= ' ...';
  }

  return $string;
}
262
/**
 * Encodes MIME/HTTP header values that contain non-ASCII, UTF-8 encoded
 * characters.
 *
 * For example, mime_header_encode('tést.txt') returns "=?UTF-8?B?dMOpc3QudHh0?=".
 *
 * See http://www.rfc-editor.org/rfc/rfc2047.txt for more information.
 *
 * Notes:
 * - Only encode strings that contain non-ASCII characters.
 * - We progressively cut-off a chunk with drupal_truncate_bytes(). This is to
 *   ensure each chunk starts and ends on a character boundary.
 * - Using \n as the chunk separator may cause problems on some systems and may
 *   have to be changed to \r\n or \r.
 *
 * @param $string
 *   The header value to encode.
 * @return
 *   The encoded value, or $string unchanged if it only contains printable
 *   ASCII characters.
 */
function mime_header_encode($string) {
  if (preg_match('/[^\x20-\x7E]/', $string)) {
    $chunk_size = 47; // floor((75 - strlen("=?UTF-8?B??=")) * 0.75);
    $len = strlen($string);
    $output = '';
    while ($len > 0) {
      $chunk = drupal_truncate_bytes($string, $chunk_size);
      $c = strlen($chunk);
      if ($c == 0) {
        // Malformed UTF-8 can leave no character boundary inside the chunk, in
        // which case nothing would be consumed and this loop would never end.
        // Fall back to a plain byte cut so that progress is guaranteed.
        $chunk = substr($string, 0, $chunk_size);
        $c = strlen($chunk);
      }
      $output .= ' =?UTF-8?B?'. base64_encode($chunk) ."?=\n";
      $string = substr($string, $c);
      $len -= $c;
    }
    return trim($output);
  }
  return $string;
}
294
/**
 * Complement to mime_header_encode.
 *
 * Decodes the RFC 2047 encoded-words ("=?charset?encoding?data?=") found in a
 * header value and converts their content to UTF-8.
 *
 * @param $header
 *   The header value to decode.
 * @return
 *   The header value with all recognised encoded-words decoded.
 */
function mime_header_decode($header) {
  // The encoding letter (Q or B) is matched case-insensitively, as RFC 2047
  // declares it case-independent.
  // First step: encoded chunks followed by other encoded chunks (need to collapse whitespace)
  $header = preg_replace_callback('/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=\s+(?==\?)/i', '_mime_header_decode', $header);
  // Second step: remaining chunks (do not collapse whitespace)
  return preg_replace_callback('/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=/i', '_mime_header_decode', $header);
}

/**
 * Helper function to mime_header_decode.
 *
 * @param $matches
 *   Regexp groups:
 *   - 0: The complete match.
 *   - 1: Character set name.
 *   - 2: Escaping method (Q or B, in either case).
 *   - 3: Encoded data.
 * @return
 *   The decoded data as UTF-8, or the match unchanged when its character set
 *   could not be converted.
 */
function _mime_header_decode($matches) {
  if (strtoupper($matches[2]) == 'B') {
    $data = base64_decode($matches[3]);
  }
  else {
    // In Q encoding "_" stands for a space. It has to be translated before
    // the "=XX" escapes are resolved, otherwise a literal underscore written
    // as "=5F" would be turned into a space as well.
    $data = quoted_printable_decode(str_replace('_', ' ', $matches[3]));
  }
  if (strtolower($matches[1]) != 'utf-8') {
    $converted = drupal_convert_to_utf8($data, $matches[1]);
    if ($converted === FALSE) {
      // Keep the encoded-word as it is instead of silently dropping it.
      return $matches[0];
    }
    $data = $converted;
  }
  return $data;
}
319
/**
 * Decodes all HTML entities (including numerical ones) to regular UTF-8 bytes.
 *
 * Double-escaped entities will only be decoded once ("&amp;lt;" becomes "&lt;",
 * not "<"). Be careful when using this function, as decode_entities can revert
 * previous sanitization efforts (an escaped script tag such as
 * "&lt;script&gt;" becomes a real tag again).
 *
 * @param $text
 *   The text to decode entities in.
 * @param $exclude
 *   An array of characters which should not be decoded. For example,
 *   array('<', '&', '"'). This affects both named and numerical entities.
 *
 * @return
 *   The input $text, with all HTML entities decoded once.
 */
function decode_entities($text, $exclude = array()) {
  // Filled in by the include file below the first time this function runs.
  static $html_entities;
  if (!isset($html_entities)) {
    include_once './includes/unicode.entities.inc';
  }

  // Flip the exclude list so that we can do quick lookups later.
  $exclude = array_flip($exclude);

  // Use a regexp to select all entities in one pass, to avoid decoding
  // double-escaped entities twice. A callback is used because the
  // PREG_REPLACE_EVAL modifier 'e' is no longer available in current PHP
  // versions (preg_replace() would fail and return NULL). The closure requires
  // PHP 5.3 or later.
  $table = $html_entities;
  return preg_replace_callback('/&(#x?)?([A-Za-z0-9]+);/', function ($matches) use ($table, $exclude) {
    return _decode_entities($matches[1], $matches[2], $matches[0], $table, $exclude);
  }, $text);
}
351
/**
 * Helper function for decode_entities.
 *
 * @param $prefix
 *   '' for a named entity, '#' for a decimal and '#x' for a hexadecimal
 *   numerical entity.
 * @param $codepoint
 *   The entity name or number, without '&', prefix and ';'.
 * @param $original
 *   The complete entity as found in the text.
 * @param $html_entities
 *   Lookup table mapping complete named entities to their UTF-8 characters.
 * @param $exclude
 *   Array whose keys are the characters that must not be decoded.
 * @return
 *   The decoded character, or $original if the entity is unknown, excluded,
 *   malformed, or does not denote a valid Unicode code point.
 */
function _decode_entities($prefix, $codepoint, $original, &$html_entities, &$exclude) {
  // Named entity
  if (!$prefix) {
    // A named entity not in the exclude list.
    if (isset($html_entities[$original]) && !isset($exclude[$html_entities[$original]])) {
      return $html_entities[$original];
    }
    else {
      return $original;
    }
  }

  // Numerical entity: validate the digits and convert them to an integer.
  // Leading zeros are stripped first; the digit-count limits keep the
  // conversion inside the integer range on every platform.
  if ($prefix == '#x') {
    if (!preg_match('/^[0-9A-Fa-f]+$/', $codepoint)) {
      return $original;
    }
    $digits = ltrim($codepoint, '0');
    if (strlen($digits) > 6) {
      return $original;
    }
    $codepoint = hexdec($digits);
  }
  else {
    if (!preg_match('/^[0-9]+$/', $codepoint)) {
      return $original;
    }
    $digits = ltrim($codepoint, '0');
    if (strlen($digits) > 7) {
      return $original;
    }
    $codepoint = (int) $digits;
  }

  // Surrogates and values beyond U+10FFFF cannot be represented in UTF-8.
  if ($codepoint > 0x10FFFF || ($codepoint >= 0xD800 && $codepoint <= 0xDFFF)) {
    return $original;
  }

  // Encode codepoint as UTF-8 bytes
  if ($codepoint < 0x80) {
    $str = chr($codepoint);
  }
  else if ($codepoint < 0x800) {
    $str = chr(0xC0 | ($codepoint >> 6))
         . chr(0x80 | ($codepoint & 0x3F));
  }
  else if ($codepoint < 0x10000) {
    $str = chr(0xE0 | ( $codepoint >> 12))
         . chr(0x80 | (($codepoint >> 6) & 0x3F))
         . chr(0x80 | ( $codepoint & 0x3F));
  }
  else {
    $str = chr(0xF0 | ( $codepoint >> 18))
         . chr(0x80 | (($codepoint >> 12) & 0x3F))
         . chr(0x80 | (($codepoint >> 6) & 0x3F))
         . chr(0x80 | ( $codepoint & 0x3F));
  }

  // Check for excluded characters
  if (isset($exclude[$str])) {
    return $original;
  }
  else {
    return $str;
  }
}
401
/**
 * Count the number of characters in a UTF-8 string. This is less than or
 * equal to the byte count.
 *
 * Uses mb_strlen() when the global $multibyte status is UNICODE_MULTIBYTE;
 * otherwise counts every byte that is not a UTF-8 continuation byte.
 */
function drupal_strlen($text) {
  global $multibyte;

  if ($multibyte != UNICODE_MULTIBYTE) {
    // Drop the continuation bytes (0x80-0xBF); what is left is one byte per
    // character.
    $without_continuation = preg_replace("/[\x80-\xBF]/", '', $text);
    return strlen($without_continuation);
  }

  return mb_strlen($text);
}
416
/**
 * Uppercase a UTF-8 string.
 *
 * Uses mb_strtoupper() when the global $multibyte status is
 * UNICODE_MULTIBYTE. Otherwise strtoupper() is applied, followed by a case
 * flip of the two-byte sequences for U+E0-U+F6 and U+F8-U+FE.
 */
function drupal_strtoupper($text) {
  global $multibyte;

  if ($multibyte != UNICODE_MULTIBYTE) {
    // Use C-locale for ASCII-only uppercase.
    $ascii_upper = strtoupper($text);
    // Case flip Latin-1 accented letters.
    return preg_replace_callback('/\xC3[\xA0-\xB6\xB8-\xBE]/', '_unicode_caseflip', $ascii_upper);
  }

  return mb_strtoupper($text);
}
433
/**
 * Lowercase a UTF-8 string.
 *
 * Uses mb_strtolower() when the global $multibyte status is
 * UNICODE_MULTIBYTE. Otherwise strtolower() is applied, followed by a case
 * flip of the two-byte sequences for U+C0-U+D6 and U+D8-U+DE.
 */
function drupal_strtolower($text) {
  global $multibyte;

  if ($multibyte != UNICODE_MULTIBYTE) {
    // Use C-locale for ASCII-only lowercase.
    $ascii_lower = strtolower($text);
    // Case flip Latin-1 accented letters.
    return preg_replace_callback('/\xC3[\x80-\x96\x98-\x9E]/', '_unicode_caseflip', $ascii_lower);
  }

  return mb_strtolower($text);
}
450
/**
 * Helper function for case conversion of Latin-1.
 *
 * Receives a regexp match whose full text is a two-byte sequence and returns
 * it with bit 0x20 of the second byte toggled. The first byte is kept
 * unchanged.
 */
function _unicode_caseflip($matches) {
  $sequence = $matches[0];
  $lead = $sequence[0];
  $trail = ord($sequence[1]);
  return $lead . chr($trail ^ 32);
}
458
/**
 * Capitalize the first letter of a UTF-8 string.
 *
 * The first character is passed through drupal_strtoupper(); the remainder of
 * the string is appended unchanged.
 */
function drupal_ucfirst($text) {
  // Note: no mbstring equivalent!
  $initial = drupal_substr($text, 0, 1);
  $remainder = drupal_substr($text, 1);
  return drupal_strtoupper($initial) . $remainder;
}
466
/**
 * Cut off a piece of a string based on character indices and counts. Follows
 * the same behavior as PHP's own substr() function.
 *
 * Note that for cutting off a string at a known character/substring
 * location, the usage of PHP's normal strpos/substr is safe and
 * much faster.
 *
 * @param $text
 *   The UTF-8 string to cut a piece from.
 * @param $start
 *   Character index of the first character to return. A negative value counts
 *   from the end of the string.
 * @param $length
 *   Number of characters to return. A negative value leaves out that many
 *   characters at the end, 0 yields an empty string and NULL (the default)
 *   returns everything up to the end of the string.
 * @return
 *   The requested part of $text.
 */
function drupal_substr($text, $start, $length = NULL) {
  global $multibyte;
  if ($multibyte == UNICODE_MULTIBYTE) {
    return $length === NULL ? mb_substr($text, $start) : mb_substr($text, $start, $length);
  }

  // Emulation: characters are counted by looking for bytes that are not UTF-8
  // continuation bytes (0x80-0xBF). All loops stay inside the string.
  $strlen = strlen($text);

  // Find the starting byte offset.
  $istart = 0;
  if ($start > 0) {
    // Walk forward to the first byte of character number $start. If the
    // string has fewer characters, the loop ends with $istart == $strlen.
    $chars = -1;
    for ($istart = 0; $istart < $strlen; $istart++) {
      $c = ord($text[$istart]);
      if ($c < 0x80 || $c >= 0xC0) {
        if (++$chars == $start) {
          break;
        }
      }
    }
  }
  else if ($start < 0) {
    // Walk backwards from the end until abs($start) characters were passed or
    // the beginning of the string is reached.
    $wanted = abs($start);
    $chars = 0;
    $istart = $strlen;
    while ($istart > 0 && $chars < $wanted) {
      $istart--;
      $c = ord($text[$istart]);
      if ($c < 0x80 || $c >= 0xC0) {
        $chars++;
      }
    }
  }

  // Find the ending byte offset (inclusive).
  if ($length === NULL) {
    $iend = $strlen - 1;
  }
  else if ($length > 0) {
    // Look for the first byte of the character that follows the requested
    // ones; the piece ends one byte before it. Without such a character the
    // piece runs up to the end of the string.
    $iend = $strlen - 1;
    $chars = 0;
    for ($i = $istart + 1; $i < $strlen; $i++) {
      $c = ord($text[$i]);
      if ($c < 0x80 || $c >= 0xC0) {
        if (++$chars == $length) {
          $iend = $i - 1;
          break;
        }
      }
    }
  }
  else if ($length < 0) {
    // Walk backwards from the end past abs($length) characters.
    $skip = abs($length);
    $chars = 0;
    $iend = $strlen - 1;
    while ($iend >= 0 && $chars < $skip) {
      $c = ord($text[$iend]);
      if ($c < 0x80 || $c >= 0xC0) {
        $chars++;
      }
      $iend--;
    }
  }
  else {
    // $length == 0: nothing to return.
    return '';
  }

  return substr($text, $istart, max(0, $iend - $istart + 1));
}
546
547

Nota: Vea TracBrowser para ayuda de uso del navegador del repositorio.

Descargar en otros formatos:

Formato original