1 | <?php |
---|
2 | // $Id $ |
---|
3 | |
---|
/**
 * @file
 * Helper functions to clean strings to make them URL safe and translatable.
 *
 * This was copied directly from pathauto and put here to be made available
 * to all, because more things than just pathauto want URL safe strings.
 *
 * To use, simply:
 * @code
 * ctools_include('cleanstring');
 * $output = ctools_cleanstring($string);
 * @endcode
 *
 * You can add a variety of settings as an array in the second argument,
 * including words to ignore, how to deal with punctuation, length
 * limits, and more. See the function itself for options.
 */
---|
20 | |
---|
/**
 * Character ranges for code points that are NOT letters or numbers.
 *
 * Copied from search.module's PREG_CLASS_SEARCH_EXCLUDE, which lists what the
 * search index leaves out. The index only contains the following character
 * classes, so the ranges below are intended to cover everything else:
 * Lu Letter, Uppercase
 * Ll Letter, Lowercase
 * Lt Letter, Titlecase
 * Lo Letter, Other
 * Nd Number, Decimal Digit
 * No Number, Other
 *
 * See: http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
 *
 * The value is a sequence of \x{...} code points and ranges without
 * surrounding brackets; it is meant to be embedded inside a PCRE character
 * class in a pattern that uses the /u modifier. Despite the ALNUM name, a
 * plain [ ... ] class built from it matches the non-alphanumeric characters;
 * to match alphanumerics it has to be negated, as ctools_cleanstring() does
 * with '/^[^' . CTOOLS_PREG_CLASS_ALNUM . ']+$/uD'.
 */
define('CTOOLS_PREG_CLASS_ALNUM',
  '\x{0}-\x{2f}\x{3a}-\x{40}\x{5b}-\x{60}\x{7b}-\x{bf}\x{d7}\x{f7}\x{2b0}-'.
  '\x{385}\x{387}\x{3f6}\x{482}-\x{489}\x{559}-\x{55f}\x{589}-\x{5c7}\x{5f3}-'.
  '\x{61f}\x{640}\x{64b}-\x{65e}\x{66a}-\x{66d}\x{670}\x{6d4}\x{6d6}-\x{6ed}'.
  '\x{6fd}\x{6fe}\x{700}-\x{70f}\x{711}\x{730}-\x{74a}\x{7a6}-\x{7b0}\x{901}-'.
  '\x{903}\x{93c}\x{93e}-\x{94d}\x{951}-\x{954}\x{962}-\x{965}\x{970}\x{981}-'.
  '\x{983}\x{9bc}\x{9be}-\x{9cd}\x{9d7}\x{9e2}\x{9e3}\x{9f2}-\x{a03}\x{a3c}-'.
  '\x{a4d}\x{a70}\x{a71}\x{a81}-\x{a83}\x{abc}\x{abe}-\x{acd}\x{ae2}\x{ae3}'.
  '\x{af1}-\x{b03}\x{b3c}\x{b3e}-\x{b57}\x{b70}\x{b82}\x{bbe}-\x{bd7}\x{bf0}-'.
  '\x{c03}\x{c3e}-\x{c56}\x{c82}\x{c83}\x{cbc}\x{cbe}-\x{cd6}\x{d02}\x{d03}'.
  '\x{d3e}-\x{d57}\x{d82}\x{d83}\x{dca}-\x{df4}\x{e31}\x{e34}-\x{e3f}\x{e46}-'.
  '\x{e4f}\x{e5a}\x{e5b}\x{eb1}\x{eb4}-\x{ebc}\x{ec6}-\x{ecd}\x{f01}-\x{f1f}'.
  '\x{f2a}-\x{f3f}\x{f71}-\x{f87}\x{f90}-\x{fd1}\x{102c}-\x{1039}\x{104a}-'.
  '\x{104f}\x{1056}-\x{1059}\x{10fb}\x{10fc}\x{135f}-\x{137c}\x{1390}-\x{1399}'.
  '\x{166d}\x{166e}\x{1680}\x{169b}\x{169c}\x{16eb}-\x{16f0}\x{1712}-\x{1714}'.
  '\x{1732}-\x{1736}\x{1752}\x{1753}\x{1772}\x{1773}\x{17b4}-\x{17db}\x{17dd}'.
  '\x{17f0}-\x{180e}\x{1843}\x{18a9}\x{1920}-\x{1945}\x{19b0}-\x{19c0}\x{19c8}'.
  '\x{19c9}\x{19de}-\x{19ff}\x{1a17}-\x{1a1f}\x{1d2c}-\x{1d61}\x{1d78}\x{1d9b}-'.
  '\x{1dc3}\x{1fbd}\x{1fbf}-\x{1fc1}\x{1fcd}-\x{1fcf}\x{1fdd}-\x{1fdf}\x{1fed}-'.
  '\x{1fef}\x{1ffd}-\x{2070}\x{2074}-\x{207e}\x{2080}-\x{2101}\x{2103}-\x{2106}'.
  '\x{2108}\x{2109}\x{2114}\x{2116}-\x{2118}\x{211e}-\x{2123}\x{2125}\x{2127}'.
  '\x{2129}\x{212e}\x{2132}\x{213a}\x{213b}\x{2140}-\x{2144}\x{214a}-\x{2b13}'.
  '\x{2ce5}-\x{2cff}\x{2d6f}\x{2e00}-\x{3005}\x{3007}-\x{303b}\x{303d}-\x{303f}'.
  '\x{3099}-\x{309e}\x{30a0}\x{30fb}-\x{30fe}\x{3190}-\x{319f}\x{31c0}-\x{31cf}'.
  '\x{3200}-\x{33ff}\x{4dc0}-\x{4dff}\x{a015}\x{a490}-\x{a716}\x{a802}\x{a806}'.
  '\x{a80b}\x{a823}-\x{a82b}\x{e000}-\x{f8ff}\x{fb1e}\x{fb29}\x{fd3e}\x{fd3f}'.
  '\x{fdfc}-\x{fe6b}\x{feff}-\x{ff0f}\x{ff1a}-\x{ff20}\x{ff3b}-\x{ff40}\x{ff5b}-'.
  '\x{ff65}\x{ff70}\x{ff9e}\x{ff9f}\x{ffe0}-\x{fffd}');
---|
64 | |
---|
65 | |
---|
/**
 * Clean up a string value provided by a module.
 *
 * With the default settings the resulting string contains only ASCII
 * alphanumerics and separators.
 *
 * The steps run in this order: direct replacements, slash removal,
 * transliteration, ASCII reduction, ignore-word removal, whitespace
 * replacement, separator trimming/collapsing, truncation, lower-casing.
 *
 * @param $string
 *   A string to clean.
 * @param $settings
 *   An optional array of settings to use.
 *   - 'clean id': If set, the settings are additionally passed through
 *     drupal_alter('ctools_cleanstring_' . $settings['clean id'], ...) so
 *     modules can alter the settings of one specific kind of string.
 *   - 'clean slash': If set, slashes are removed from the string. Defaults
 *     to TRUE, so you have to explicitly set this to FALSE to keep slashes
 *     (the 'reduce ascii' step leaves '/' alone).
 *   - 'ignore words': Words that will be removed rather than made safe.
 *     Either an array of words or a comma-separated string. Words are
 *     matched case-insensitively on word boundaries and are treated as
 *     literal text, not as regular expressions. Defaults to an empty array.
 *   - 'separator': Change spaces and untranslatable characters to
 *     this string. Defaults to '-'.
 *   - 'replacements': An array of direct replacements to be made that will
 *     be implemented via strtr(). Defaults to an empty array.
 *   - 'transliterate': If set and the transliteration module is enabled,
 *     the string is passed through transliteration_get(). Custom replacement
 *     tables are not handled here; use 'replacements' for those. Defaults to
 *     FALSE.
 *   - 'reduce ascii': If set to TRUE, every run of characters other than
 *     a-z, A-Z, 0-9 and '/' is replaced by the separator. Defaults to TRUE.
 *   - 'max length': If set to a number, reduce the resulting string to this
 *     maximum length. Defaults to no maximum length.
 *   - 'lower case': If set to TRUE, convert the result to lower case.
 *     Defaults to FALSE.
 *   These settings will be passed through drupal_alter().
 *
 * @return
 *   The cleaned string.
 */
function ctools_cleanstring($string, $settings = array()) {
  $settings += array(
    'clean slash' => TRUE,
    'ignore words' => array(),
    'separator' => '-',
    'replacements' => array(),
    'transliterate' => FALSE,
    'reduce ascii' => TRUE,
    'max length' => FALSE,
    'lower case' => FALSE,
  );

  // Allow modules to make other changes to the settings: first through the
  // id-specific hook (when an id was supplied), then through the generic one.
  if (isset($settings['clean id'])) {
    drupal_alter('ctools_cleanstring_' . $settings['clean id'], $settings);
  }

  drupal_alter('ctools_cleanstring', $settings);

  $output = $string;

  // Do any replacements the user selected up front.
  if (!empty($settings['replacements'])) {
    $output = strtr($output, $settings['replacements']);
  }

  // Remove slashes if instructed to do so.
  if ($settings['clean slash']) {
    $output = str_replace('/', '', $output);
  }

  // Transliteration is optional and only available through the contributed
  // transliteration module.
  if (!empty($settings['transliterate']) && module_exists('transliteration')) {
    $output = transliteration_get($output);
  }

  // Reduce to the subset of ASCII96 letters and numbers. Slashes survive this
  // step on purpose; they were already removed above if requested.
  if ($settings['reduce ascii']) {
    $pattern = '/[^a-zA-Z0-9\/]+/';
    $output = preg_replace($pattern, $settings['separator'], $output);
  }

  // Get rid of words that are on the ignore list. The list may be an array
  // (as documented) or a comma-separated string (as pathauto passes it).
  if (!empty($settings['ignore words'])) {
    $words = $settings['ignore words'];
    if (!is_array($words)) {
      $words = explode(',', $words);
    }

    // Quote every word so regex metacharacters are matched literally, and
    // drop empty entries, which would otherwise produce empty alternatives.
    $quoted = array();
    foreach ($words as $word) {
      $word = trim($word);
      if ($word !== '') {
        $quoted[] = preg_quote($word, '/');
      }
    }

    if (!empty($quoted)) {
      $ignore_re = '\b' . implode('\b|\b', $quoted) . '\b';

      // Prefer the multibyte-aware function when mbstring is available.
      if (function_exists('mb_eregi_replace')) {
        $output = mb_eregi_replace($ignore_re, '', $output);
      }
      else {
        $output = preg_replace("/$ignore_re/i", '', $output);
      }
    }
  }

  // Always replace whitespace with the separator.
  $output = preg_replace('/\s+/', $settings['separator'], $output);

  // An empty separator leaves nothing to trim or collapse; skipping it also
  // avoids building a pattern from a dangling escape character.
  if (isset($settings['separator']) && (string) $settings['separator'] !== '') {
    // Quote the separator and wrap it in a group so that the "+" quantifier
    // applies to the whole separator, not just to its last character.
    $seppattern = '(?:' . preg_quote($settings['separator'], '/') . ')';

    // Trim any leading or trailing separators.
    $output = preg_replace('/^' . $seppattern . '+|' . $seppattern . '+$/', '', $output);

    // Replace multiple separators with a single one.
    $output = preg_replace('/' . $seppattern . '+/', $settings['separator'], $output);
  }

  // Enforce the maximum component length.
  if (!empty($settings['max length'])) {
    $output = ctools_cleanstring_truncate($output, $settings['max length'], $settings['separator']);
  }

  if (!empty($settings['lower case'])) {
    $output = drupal_strtolower($output);
  }
  return $output;
}
---|
180 | |
---|
/**
 * A friendly version of truncate_utf8.
 *
 * Shortens a string to at most $length characters, preferring to cut at the
 * last separator so that the result does not end in a partial word.
 *
 * @param $string
 *   The string to be truncated.
 * @param $length
 *   An integer for the maximum desired length.
 * @param $separator
 *   A string which contains the word boundary such as - or _. If empty, the
 *   string is simply cut at $length.
 *
 * @return
 *   The string truncated to no more than $length characters. Strings that
 *   already fit are returned unchanged.
 */
function ctools_cleanstring_truncate($string, $length, $separator) {
  if (drupal_strlen($string) <= $length) {
    return $string;
  }

  // Keep one character beyond the limit so that a separator sitting exactly
  // at the cut position is still found and yields a full-length result.
  $candidate = drupal_substr($string, 0, $length + 1);

  // An empty needle makes strrpos() warn on older PHP versions and return the
  // string length on PHP 8, which would leave the extra character in place.
  $separator = (string) $separator;
  $last_break = ($separator === '') ? FALSE : strrpos($candidate, $separator);

  // A separator exists AND is not at position 0: cut just before it.
  // strrpos() returns a byte offset, so the byte-based substr() is used here.
  if ($last_break) {
    return substr($candidate, 0, $last_break);
  }

  // No usable break point: hard cut at the maximum length.
  return drupal_substr($candidate, 0, $length);
}
---|