00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "regexp.h"
00025
00026 #include "lexer.h"
00027 #include <assert.h>
00028 #include <stdio.h>
00029 #include <stdlib.h>
00030 #include <string.h>
00031
00032 using namespace KJS;
00033
// Process-wide cache of whether the PCRE library handles UTF-8. It starts
// out as Unknown; the RegExp constructor replaces it with Supported or
// Unsupported the first time it runs (only when PCRE_CONFIG_UTF8 is defined).
// prepareMatch() consults it to choose between the UTF-8 and ASCII buffers.
RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
00035
00036 RegExp::RegExp(const UString &p, int f)
00037 : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
00038 {
00039
00040 #ifdef PCRE_CONFIG_UTF8
00041 if (utf8Support == Unknown) {
00042 int supported;
00043 pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
00044 utf8Support = supported ? Supported : Unsupported;
00045 }
00046 #endif
00047
00048 nrSubPatterns = 0;
00049
00050
00051
00052
00053
00054
00055 UString intern;
00056 const char* const nil = "\\x00";
00057 if (p.find('\\') >= 0 || p.find(KJS::UChar('\0')) >= 0) {
00058 bool escape = false;
00059 for (int i = 0; i < p.size(); ++i) {
00060 UChar c = p[i];
00061 if (escape) {
00062 escape = false;
00063
00064 if (c == 'u' && i + 4 < p.size()) {
00065 int c0 = p[i+1].unicode();
00066 int c1 = p[i+2].unicode();
00067 int c2 = p[i+3].unicode();
00068 int c3 = p[i+4].unicode();
00069 if (Lexer::isHexDigit(c0) && Lexer::isHexDigit(c1) &&
00070 Lexer::isHexDigit(c2) && Lexer::isHexDigit(c3)) {
00071 c = Lexer::convertUnicode(c0, c1, c2, c3);
00072 if (c.unicode() == 0) {
00073
00074 intern += UString(nil);
00075 } else {
00076 intern += UString(&c, 1);
00077 }
00078 i += 4;
00079 continue;
00080 }
00081 }
00082 intern += UString('\\');
00083 intern += UString(&c, 1);
00084 } else {
00085 if (c == '\\')
00086 escape = true;
00087 else if (c == '\0')
00088 intern += UString(nil);
00089 else
00090 intern += UString(&c, 1);
00091 }
00092 }
00093 } else {
00094 intern = p;
00095 }
00096
00097 #ifdef HAVE_PCREPOSIX
00098 int pcreflags = 0;
00099 const char *perrormsg;
00100 int errorOffset;
00101
00102 if (flgs & IgnoreCase)
00103 pcreflags |= PCRE_CASELESS;
00104
00105 if (flgs & Multiline)
00106 pcreflags |= PCRE_MULTILINE;
00107
00108 if (utf8Support == Supported)
00109 pcreflags |= PCRE_UTF8;
00110
00111
00112
00113 prepareMatch(intern);
00114
00115 pcregex = pcre_compile(buffer, pcreflags,
00116 &perrormsg, &errorOffset, NULL);
00117 doneMatch();
00118 if (!pcregex) {
00119 #ifndef NDEBUG
00120 fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
00121 #endif
00122 valid = false;
00123 return;
00124 }
00125
00126 #ifdef PCRE_INFO_CAPTURECOUNT
00127
00128 int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
00129 if (rc != 0)
00130 #endif
00131 nrSubPatterns = 0;
00132
00133 #else
00134
00135 int regflags = 0;
00136 #ifdef REG_EXTENDED
00137 regflags |= REG_EXTENDED;
00138 #endif
00139 #ifdef REG_ICASE
00140 if ( f & IgnoreCase )
00141 regflags |= REG_ICASE;
00142 #endif
00143
00144
00145
00146
00147
00148
00149 int errorCode = regcomp(&preg, intern.ascii(), regflags);
00150 if (errorCode != 0) {
00151 #ifndef NDEBUG
00152 char errorMessage[80];
00153 regerror(errorCode, &preg, errorMessage, sizeof errorMessage);
00154 fprintf(stderr, "KJS: regcomp failed with '%s'", errorMessage);
00155 #endif
00156 valid = false;
00157 }
00158 #endif
00159 }
00160
00161 RegExp::~RegExp()
00162 {
00163 doneMatch();
00164 #ifdef HAVE_PCREPOSIX
00165 if (pcregex)
00166 pcre_free(pcregex);
00167 #else
00168
00169 regfree(&preg);
00170 #endif
00171 }
00172
00173 void RegExp::prepareUtf8(const UString& s)
00174 {
00175
00176 const int length = s.size();
00177 buffer = new char[length * 3 + 1];
00178
00179
00180
00181 originalPos = new int[length * 3 + 2];
00182
00183
00184
00185
00186 char *p = buffer;
00187 int *posOut = originalPos;
00188 const UChar *d = s.data();
00189 for (int i = 0; i != length; ++i) {
00190 unsigned short c = d[i].unicode();
00191
00192 int sequenceLen;
00193 if (c < 0x80) {
00194 *p++ = (char)c;
00195 sequenceLen = 1;
00196 } else if (c < 0x800) {
00197 *p++ = (char)((c >> 6) | 0xC0);
00198 *p++ = (char)((c | 0x80) & 0xBF);
00199 sequenceLen = 2;
00200 } else {
00201 *p++ = (char)((c >> 12) | 0xE0);
00202 *p++ = (char)(((c >> 6) | 0x80) & 0xBF);
00203 *p++ = (char)((c | 0x80) & 0xBF);
00204 sequenceLen = 3;
00205 }
00206
00207 while (sequenceLen > 0) {
00208 *posOut = i;
00209 ++posOut;
00210 --sequenceLen;
00211 }
00212 }
00213
00214 bufferSize = p - buffer;
00215
00216 *p++ = '\0';
00217
00218
00219 *posOut = length;
00220 *(posOut+1) = length+1;
00221 }
00222
00223 void RegExp::prepareASCII (const UString& s)
00224 {
00225 originalPos = 0;
00226
00227
00228
00229
00230 CString truncated = s.cstring();
00231 buffer = new char[truncated.size() + 1];
00232 memcpy(buffer, truncated.c_str(), truncated.size());
00233 buffer[truncated.size()] = '\0';
00234 bufferSize = truncated.size();
00235 }
00236
00237 void RegExp::prepareMatch(const UString &s)
00238 {
00239 delete[] originalPos;
00240 delete[] buffer;
00241 if (utf8Support == Supported)
00242 prepareUtf8(s);
00243 else
00244 prepareASCII(s);
00245
00246 #ifndef NDEBUG
00247 originalS = s;
00248 #endif
00249 }
00250
00251 void RegExp::doneMatch()
00252 {
00253 delete[] originalPos; originalPos = 0;
00254 delete[] buffer; buffer = 0;
00255 }
00256
00257 UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
00258 {
00259 #ifndef NDEBUG
00260 assert(s.data() == originalS.data());
00261 #endif
00262 assert(valid);
00263
00264 if (i < 0)
00265 i = 0;
00266 if (ovector)
00267 *ovector = 0L;
00268 int dummyPos;
00269 if (!pos)
00270 pos = &dummyPos;
00271 *pos = -1;
00272 if (i > s.size() || s.isNull())
00273 return UString::null;
00274
00275 #ifdef HAVE_PCREPOSIX
00276 int ovecsize = (nrSubPatterns+1)*3;
00277 if (ovector) *ovector = new int[ovecsize];
00278 if (!pcregex)
00279 return UString::null;
00280
00281 int startPos;
00282 int nextPos;
00283
00284 if (utf8Support == Supported) {
00285 startPos = i;
00286 while (originalPos[startPos] < i)
00287 ++startPos;
00288
00289 nextPos = startPos;
00290 while (originalPos[nextPos] < (i + 1))
00291 ++nextPos;
00292 } else {
00293 startPos = i;
00294 nextPos = i + 1;
00295 }
00296
00297 if (pcre_exec(pcregex, NULL, buffer, bufferSize, startPos,
00298 m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED) : 0,
00299 ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00300 {
00301
00302 if ((flgs & Global) && m_notEmpty && ovector)
00303 {
00304
00305
00306
00307 #ifdef KJS_VERBOSE
00308 fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
00309 #endif
00310 m_notEmpty = 0;
00311 if (pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, 0,
00312 ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00313 return UString::null;
00314 }
00315 else
00316 return UString::null;
00317 }
00318
00319
00320
00321 if (ovector && originalPos) {
00322 for (unsigned c = 0; c < 2 * (nrSubPatterns + 1); ++c) {
00323 if ((*ovector)[c] != -1)
00324 (*ovector)[c] = originalPos[(*ovector)[c]];
00325 }
00326 }
00327
00328 if (!ovector)
00329 return UString::null;
00330 #else
00331 const uint maxMatch = 10;
00332 regmatch_t rmatch[maxMatch];
00333
00334 char *str = strdup(s.ascii());
00335 if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
00336 free(str);
00337 return UString::null;
00338 }
00339 free(str);
00340
00341 if (!ovector) {
00342 *pos = rmatch[0].rm_so + i;
00343 return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
00344 }
00345
00346
00347 nrSubPatterns = 0;
00348 for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
00349 nrSubPatterns++;
00350
00351
00352 if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo)
00353 return UString::null;
00354 }
00355
00356 if (nrSubPatterns == 0) nrSubPatterns = 1;
00357
00358 int ovecsize = (nrSubPatterns)*3;
00359 *ovector = new int[ovecsize];
00360 for (uint j = 0; j < nrSubPatterns; j++) {
00361 (*ovector)[2*j] = rmatch[j].rm_so + i;
00362 (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
00363 }
00364 #endif
00365
00366 *pos = (*ovector)[0];
00367 if ( *pos == (*ovector)[1] && (flgs & Global) )
00368 {
00369
00370 m_notEmpty=true;
00371 }
00372 return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
00373 }
00374
00375 #if 0 // unused
00376 bool RegExp::test(const UString &s, int)
00377 {
00378 #ifdef HAVE_PCREPOSIX
00379 int ovector[300];
00380 CString buffer(s.cstring());
00381
00382 if (s.isNull() ||
00383 pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
00384 0, ovector, 300) == PCRE_ERROR_NOMATCH)
00385 return false;
00386 else
00387 return true;
00388
00389 #else
00390
00391 char *str = strdup(s.ascii());
00392 int r = regexec(&preg, str, 0, 0, 0);
00393 free(str);
00394
00395 return r == 0;
00396 #endif
00397 }
00398 #endif