1 | <?php |
---|
2 | |
---|
3 | /** |
---|
4 | * Contains CSV Parser. |
---|
5 | * Functions in this file are independent of the Feeds specific implementation. |
---|
6 | * Thanks to jpetso http://drupal.org/user/56020 for most of the code in this |
---|
7 | * file. |
---|
8 | */ |
---|
9 | |
---|
10 | /** |
---|
11 | * Iterator over the text lines of a file. |
---|
12 | */ |
---|
class ParserCSVIterator implements Iterator {
  // Stream resource returned by fopen(), or FALSE if the file could not be
  // opened.
  private $handle;
  // The line most recently read, or NULL when there is no (further) line.
  private $currentLine;
  // Byte offset in the file right after the most recently read line.
  private $currentPos;

  /**
   * Opens the given file for reading.
   *
   * @param $filepath
   *   Path of the file to iterate over. If it cannot be opened, all reading
   *   methods become no-ops and valid() never returns TRUE.
   */
  public function __construct($filepath) {
    $this->handle = fopen($filepath, 'r');
    $this->currentLine = NULL;
    $this->currentPos = NULL;
  }

  /**
   * Closes the file handle, if one was opened.
   */
  public function __destruct() {
    if ($this->handle) {
      fclose($this->handle);
    }
  }

  /**
   * Moves to the given byte offset and reads the line starting there.
   *
   * @param $pos
   *   Byte offset to seek to before reading; defaults to the start of the
   *   file.
   */
  #[\ReturnTypeWillChange]
  public function rewind($pos = 0) {
    if ($this->handle) {
      fseek($this->handle, $pos);
      $this->next();
    }
  }

  /**
   * Reads the next line and records the byte offset that follows it.
   *
   * @return
   *   The line that was read (including its line ending), or NULL if the end
   *   of the file has been reached.
   */
  #[\ReturnTypeWillChange]
  public function next() {
    if ($this->handle) {
      $line = feof($this->handle) ? FALSE : fgets($this->handle);
      // feof() only turns TRUE after a read has hit the end of the file, so
      // fgets() may still return FALSE here. Store NULL in that case, because
      // valid() relies on isset() and would otherwise report a phantom line.
      $this->currentLine = ($line === FALSE) ? NULL : $line;
      $this->currentPos = ftell($this->handle);
      return $this->currentLine;
    }
  }

  /**
   * Tells whether a line is currently available.
   */
  #[\ReturnTypeWillChange]
  public function valid() {
    return isset($this->currentLine);
  }

  /**
   * Returns the line that was read last, or NULL if there is none.
   */
  #[\ReturnTypeWillChange]
  public function current() {
    return $this->currentLine;
  }

  /**
   * Returns the byte offset right after the line that was read last.
   */
  public function currentPos() {
    return $this->currentPos;
  }

  /**
   * Returns the iteration key, which is the constant string 'line'.
   */
  #[\ReturnTypeWillChange]
  public function key() {
    return 'line';
  }
}
---|
61 | |
---|
62 | /** |
---|
63 | * Functionality to parse CSV files into a two dimensional array. |
---|
64 | */ |
---|
class ParserCSV {
  private $delimiter;
  private $skipFirstLine;
  private $columnNames;
  private $timeout;
  private $timeoutReached;
  private $startByte;
  private $lineLimit;
  private $lastLinePos;

  /**
   * Initializes the parser with its defaults: comma delimiter, no skipped
   * line, no column names, no timeout, start at byte 0 and no line limit.
   */
  public function __construct() {
    $this->delimiter = ',';
    $this->skipFirstLine = FALSE;
    $this->columnNames = FALSE;
    $this->timeout = FALSE;
    $this->timeoutReached = FALSE;
    $this->startByte = 0;
    $this->lineLimit = 0;
    $this->lastLinePos = 0;
    // Ask PHP to detect the line ending convention of the files it reads.
    // Note that this changes the setting for the rest of the request.
    ini_set('auto_detect_line_endings', TRUE);
  }

  /**
   * Set the column delimiter string.
   * By default, the comma (',') is used as delimiter.
   */
  public function setDelimiter($delimiter) {
    $this->delimiter = $delimiter;
  }

  /**
   * Set this to TRUE if the parser should skip the first line of the CSV text,
   * which might be desired if the first line contains the column names.
   * By default, this is set to FALSE and the first line is not skipped.
   */
  public function setSkipFirstLine($skipFirstLine) {
    $this->skipFirstLine = $skipFirstLine;
  }

  /**
   * Specify an array of column names if you know them in advance, or FALSE
   * (which is the default) to unset any prior column names. If no column names
   * are set, the parser will put each row into a simple numerically indexed
   * array. If column names are given, the parser will create arrays with
   * these column names as array keys instead.
   */
  public function setColumnNames($columnNames) {
    $this->columnNames = $columnNames;
  }

  /**
   * Define the time (in milliseconds) after which the parser stops parsing,
   * even if it has not yet finished processing the CSV data. If the timeout
   * has been reached before parsing is done, the parse() method will return
   * an incomplete list of rows - a single row will never be cut off in the
   * middle, though. By default, no timeout (@p $timeout == FALSE) is defined.
   *
   * You can check if the timeout has been reached by calling the
   * timeoutReached() method after parse() has been called.
   */
  public function setTimeout($timeout) {
    $this->timeout = $timeout;
  }

  /**
   * After calling the parse() method, determine if the timeout (set by the
   * setTimeout() method) has been reached.
   *
   * @deprecated Use lastLinePos() instead to determine whether a file has
   *   finished parsing.
   */
  public function timeoutReached() {
    return $this->timeoutReached;
  }

  /**
   * Define the number of lines to parse in one parsing operation.
   *
   * By default, all lines of a file are being parsed.
   */
  public function setLineLimit($lines) {
    $this->lineLimit = $lines;
  }

  /**
   * Get the byte number where the parser left off after last parse() call.
   *
   * @return
   *   0 if all lines or no line has been parsed, the byte position of where a
   *   timeout or the line limit has been reached otherwise. This position can be
   *   used to set the start byte for the next iteration after parse() has
   *   reached the timeout set with setTimeout() or the line limit set with
   *   setLineLimit().
   *
   * @see ParserCSV::setStartByte($start);
   */
  public function lastLinePos() {
    return $this->lastLinePos;
  }

  /**
   * Set the byte where file should be started to read.
   *
   * Useful when parsing a file in batches.
   *
   * @return
   *   The start byte that was just set.
   */
  public function setStartByte($start) {
    return $this->startByte = $start;
  }

  /**
   * Parse CSV files into a two dimensional array.
   *
   * Parsing starts at the byte set with setStartByte() and stops at the end
   * of the input, or earlier when the timeout set with setTimeout() or the
   * line limit set with setLineLimit() is reached.
   *
   * @param Iterator $lineIterator
   *   An Iterator object that yields line strings, e.g. ParserCSVIterator.
   *   Besides the Iterator interface, it has to accept a byte position in
   *   rewind() and provide a currentPos() method.
   * @return
   *   Two dimensional array that contains the data in the CSV file. Rows are
   *   numerically indexed arrays, or arrays keyed by column name if column
   *   names have been set with setColumnNames().
   */
  public function parse(Iterator $lineIterator) {
    $skipLine = $this->skipFirstLine;
    $rows = array();

    $this->timeoutReached = FALSE;
    $this->lastLinePos = 0;
    // microtime(TRUE) returns the current time in seconds as a float, whereas
    // the timeout is given in milliseconds.
    $maxTime = empty($this->timeout) ? FALSE : (microtime(TRUE) + $this->timeout / 1000);
    $linesParsed = 0;
    $delimiterLength = strlen($this->delimiter);

    for ($lineIterator->rewind($this->startByte); $lineIterator->valid(); $lineIterator->next()) {

      // Make really sure we've got lines without trailing newlines.
      $line = trim($lineIterator->current(), "\r\n");

      // Skip empty lines. Compare against the empty string explicitly, as
      // empty() would also discard a line that consists of a single "0".
      if ($line === '') {
        continue;
      }
      // If the first line contains column names, skip it.
      if ($skipLine) {
        $skipLine = FALSE;
        continue;
      }

      // The actual parser. explode() is unfortunately not suitable because the
      // delimiter might be located inside a quoted field, and that would break
      // the field and/or require additional effort to re-join the fields.
      $quoted = FALSE;
      $currentIndex = 0;
      $currentField = '';
      $fields = array();

      // "<=" instead of "<" so that a line ending in a delimiter still gets
      // its trailing empty field.
      while ($currentIndex <= strlen($line)) {
        if ($quoted) {
          $nextQuoteIndex = strpos($line, '"', $currentIndex);

          if ($nextQuoteIndex === FALSE) {
            // There's a line break before the quote is closed, so fetch the
            // next line and start from there.
            $currentField .= substr($line, $currentIndex);
            $lineIterator->next();

            if (!$lineIterator->valid()) {
              // Whoa, an unclosed quote! Well whatever, let's just ignore
              // that shortcoming and record it nevertheless.
              $fields[] = $currentField;
              break;
            }
            // Ok, so, on with fetching the next line, as mentioned above.
            $currentField .= "\n";
            $line = trim($lineIterator->current(), "\r\n");
            $currentIndex = 0;
            continue;
          }

          // There's actually another quote in this line...
          // find out whether it's escaped or not.
          $currentField .= substr($line, $currentIndex, $nextQuoteIndex - $currentIndex);

          if (isset($line[$nextQuoteIndex + 1]) && $line[$nextQuoteIndex + 1] === '"') {
            // Escaped quote, add a single one to the field and proceed quoted.
            $currentField .= '"';
            $currentIndex = $nextQuoteIndex + 2;
          }
          else {
            // End of the quoted section, close the quote and let the
            // $quoted == FALSE block finalize the field.
            $quoted = FALSE;
            $currentIndex = $nextQuoteIndex + 1;
          }
        }
        else { // $quoted == FALSE
          // First, let's find out where the next character of interest is.
          $nextQuoteIndex = strpos($line, '"', $currentIndex);
          $nextDelimiterIndex = strpos($line, $this->delimiter, $currentIndex);

          if ($nextQuoteIndex === FALSE) {
            $nextIndex = $nextDelimiterIndex;
          }
          elseif ($nextDelimiterIndex === FALSE) {
            $nextIndex = $nextQuoteIndex;
          }
          else {
            $nextIndex = min($nextQuoteIndex, $nextDelimiterIndex);
          }

          if ($nextIndex === FALSE) {
            // This line is done, add the rest of it as last field.
            $currentField .= substr($line, $currentIndex);
            $fields[] = $currentField;
            break;
          }
          elseif ($nextIndex === $nextDelimiterIndex) {
            // The field ends right before the delimiter; no part of a
            // multi-character delimiter may end up in the field value.
            $currentField .= substr($line, $currentIndex, $nextIndex - $currentIndex);
            $fields[] = $currentField;
            $currentField = '';
            // Continue with the next field, right after the whole delimiter.
            $currentIndex = $nextIndex + $delimiterLength;
          }
          else { // $line[$nextIndex] == '"'
            $quoted = TRUE;
            $currentField .= substr($line, $currentIndex, $nextIndex - $currentIndex);
            $currentIndex = $nextIndex + 1;
            // Continue this field in the $quoted == TRUE block.
          }
        }
      }
      // End of CSV parser. We've now got all the fields of the line as strings
      // in the $fields array.

      if (empty($this->columnNames)) {
        $row = $fields;
      }
      else {
        // Map the fields onto the column names in order. Missing fields
        // become empty strings, surplus fields are dropped.
        $row = array();
        foreach ($this->columnNames as $columnName) {
          $field = array_shift($fields);
          $row[$columnName] = isset($field) ? $field : '';
        }
      }
      $rows[] = $row;

      // Quit parsing if timeout has been reached or requested lines have been
      // reached.
      if (!empty($maxTime) && microtime(TRUE) > $maxTime) {
        $this->timeoutReached = TRUE;
        $this->lastLinePos = $lineIterator->currentPos();
        break;
      }
      $linesParsed++;
      if ($this->lineLimit && $linesParsed >= $this->lineLimit) {
        $this->lastLinePos = $lineIterator->currentPos();
        break;
      }
    }
    return $rows;
  }
}
---|