regexp.cpp

00001 // -*- c-basic-offset: 2 -*-
00002 /*
00003  *  This file is part of the KDE libraries
00004  *  Copyright (C) 1999-2001 Harri Porten (porten@kde.org)
00005  *  Copyright (C) 2003,2004 Apple Computer, Inc.
00006  *  Copyright (C) 2006      Maksim Orlovich (maksim@kde.org)
00007  *
00008  *  This library is free software; you can redistribute it and/or
00009  *  modify it under the terms of the GNU Lesser General Public
00010  *  License as published by the Free Software Foundation; either
00011  *  version 2 of the License, or (at your option) any later version.
00012  *
00013  *  This library is distributed in the hope that it will be useful,
00014  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  *  Lesser General Public License for more details.
00017  *
00018  *  You should have received a copy of the GNU Lesser General Public
00019  *  License along with this library; if not, write to the Free Software
00020  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
00021  *
00022  */
00023 
00024 #include "regexp.h"
00025 
00026 #include "lexer.h"
00027 #include <assert.h>
00028 #include <stdio.h>
00029 #include <stdlib.h>
00030 #include <string.h>
00031 
00032 using namespace KJS;
00033 
00034 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
00035 
00036 RegExp::RegExp(const UString &p, int f)
00037   : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
00038 {
00039   // Determine whether libpcre has unicode support if need be..
00040 #ifdef PCRE_CONFIG_UTF8
00041   if (utf8Support == Unknown) {
00042     int supported;
00043     pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
00044     utf8Support = supported ? Supported : Unsupported;
00045   }
00046 #endif
00047 
00048   nrSubPatterns = 0; // determined in match() with POSIX regex.
00049 
00050   // JS regexps can contain Unicode escape sequences (\uxxxx) which
00051   // are rather uncommon elsewhere. As our regexp libs don't understand
00052   // them we do the unescaping ourselves internally.
00053   // Also make sure to expand out any nulls as pcre_compile 
00054   // expects null termination..
00055   UString intern;
00056   const char* const nil = "\\x00";
00057   if (p.find('\\') >= 0 || p.find(KJS::UChar('\0')) >= 0) {
00058     bool escape = false;
00059     for (int i = 0; i < p.size(); ++i) {
00060       UChar c = p[i];
00061       if (escape) {
00062         escape = false;
00063         // we only care about \uxxxx
00064         if (c == 'u' && i + 4 < p.size()) {
00065           int c0 = p[i+1].unicode();
00066           int c1 = p[i+2].unicode();
00067           int c2 = p[i+3].unicode();
00068           int c3 = p[i+4].unicode();
00069           if (Lexer::isHexDigit(c0) && Lexer::isHexDigit(c1) &&
00070               Lexer::isHexDigit(c2) && Lexer::isHexDigit(c3)) {
00071             c = Lexer::convertUnicode(c0, c1, c2, c3);
00072             if (c.unicode() == 0) {
00073                 // Make sure to encode 0, to avoid terminating the string
00074                 intern += UString(nil);
00075             } else {
00076                 intern += UString(&c, 1);
00077             }
00078             i += 4;
00079             continue;
00080           }
00081         }
00082         intern += UString('\\');
00083         intern += UString(&c, 1);
00084       } else {
00085         if (c == '\\')
00086           escape = true;
00087         else if (c == '\0')
00088           intern += UString(nil);
00089         else
00090           intern += UString(&c, 1);
00091       }
00092     }
00093   } else {
00094     intern = p;
00095   }
00096 
00097 #ifdef HAVE_PCREPOSIX
00098   int pcreflags = 0;
00099   const char *perrormsg;
00100   int errorOffset;
00101 
00102   if (flgs & IgnoreCase)
00103     pcreflags |= PCRE_CASELESS;
00104 
00105   if (flgs & Multiline)
00106     pcreflags |= PCRE_MULTILINE;
00107 
00108   if (utf8Support == Supported)
00109     pcreflags |= PCRE_UTF8;
00110 
00111   // Fill our buffer with an encoded version, whether utf-8, or, 
00112   // if PCRE is incapable, truncated.
00113   prepareMatch(intern);
00114 
00115   pcregex = pcre_compile(buffer, pcreflags,
00116              &perrormsg, &errorOffset, NULL);
00117   doneMatch(); // Cleanup buffers
00118   if (!pcregex) {
00119 #ifndef NDEBUG
00120     fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
00121 #endif
00122     valid = false;
00123     return;
00124   }
00125 
00126 #ifdef PCRE_INFO_CAPTURECOUNT
00127   // Get number of subpatterns that will be returned
00128   int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
00129   if (rc != 0)
00130 #endif
00131     nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
00132 
00133 #else /* HAVE_PCREPOSIX */
00134 
00135   int regflags = 0;
00136 #ifdef REG_EXTENDED
00137   regflags |= REG_EXTENDED;
00138 #endif
00139 #ifdef REG_ICASE
00140   if ( f & IgnoreCase )
00141     regflags |= REG_ICASE;
00142 #endif
00143 
00144   //NOTE: Multiline is not feasible with POSIX regex.
00145   //if ( f & Multiline )
00146   //    ;
00147   // Note: the Global flag is already handled by RegExpProtoFunc::execute
00148 
00149   int errorCode = regcomp(&preg, intern.ascii(), regflags);
00150   if (errorCode != 0) {
00151 #ifndef NDEBUG
00152     char errorMessage[80];
00153     regerror(errorCode, &preg, errorMessage, sizeof errorMessage);
00154     fprintf(stderr, "KJS: regcomp failed with '%s'", errorMessage);
00155 #endif
00156     valid = false;
00157   }
00158 #endif
00159 }
00160 
00161 RegExp::~RegExp()
00162 {
00163   doneMatch(); // Be 100% sure buffers are freed
00164 #ifdef HAVE_PCREPOSIX
00165   if (pcregex)
00166     pcre_free(pcregex);
00167 #else
00168   /* TODO: is this really okay after an error ? */
00169   regfree(&preg);
00170 #endif
00171 }
00172 
00173 void RegExp::prepareUtf8(const UString& s)
00174 {
00175   // Allocate a buffer big enough to hold all the characters plus \0
00176   const int length = s.size();
00177   buffer = new char[length * 3 + 1];
00178 
00179   // Also create buffer for positions. We need one extra character in there,
00180   // even past the \0 since the non-empty handling may jump one past the end
00181   originalPos = new int[length * 3 + 2];
00182 
00183   // Convert to runs of 8-bit characters, and generate indeces
00184   // Note that we do NOT combine surrogate pairs here, as 
00185   // regexps operate on them as separate characters
00186   char *p      = buffer;
00187   int  *posOut = originalPos;
00188   const UChar *d = s.data();
00189   for (int i = 0; i != length; ++i) {
00190     unsigned short c = d[i].unicode();
00191 
00192     int sequenceLen;
00193     if (c < 0x80) {
00194       *p++ = (char)c;
00195       sequenceLen = 1;
00196     } else if (c < 0x800) {
00197       *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
00198       *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
00199       sequenceLen = 2;
00200     } else {
00201       *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
00202       *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
00203       *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
00204       sequenceLen = 3;
00205     }
00206 
00207     while (sequenceLen > 0) {
00208       *posOut = i;
00209       ++posOut;
00210       --sequenceLen;
00211     }
00212   }
00213 
00214   bufferSize = p - buffer;
00215 
00216   *p++ = '\0';
00217 
00218   // Record positions for \0, and the fictional character after that.
00219   *posOut     = length;
00220   *(posOut+1) = length+1;
00221 }
00222 
00223 void RegExp::prepareASCII (const UString& s)
00224 {
00225   originalPos = 0;
00226 
00227   // Best-effort attempt to get something done
00228   // when we don't have utf 8 available -- use 
00229   // truncated version, and pray for the best 
00230   CString truncated = s.cstring();
00231   buffer = new char[truncated.size() + 1];
00232   memcpy(buffer, truncated.c_str(), truncated.size());
00233   buffer[truncated.size()] = '\0'; // For _compile use
00234   bufferSize = truncated.size();
00235 }
00236 
00237 void RegExp::prepareMatch(const UString &s)
00238 {
00239   delete[] originalPos; // Just to be sure..
00240   delete[] buffer;
00241   if (utf8Support == Supported)
00242     prepareUtf8(s);
00243   else
00244     prepareASCII(s);
00245 
00246 #ifndef NDEBUG
00247   originalS = s;
00248 #endif
00249 }
00250 
00251 void RegExp::doneMatch() 
00252 {
00253   delete[] originalPos; originalPos = 0;
00254   delete[] buffer;      buffer      = 0;
00255 }
00256 
00257 UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
00258 {
00259 #ifndef NDEBUG
00260   assert(s.data() == originalS.data()); // Make sure prepareMatch got called right..
00261 #endif
00262   assert(valid);
00263 
00264   if (i < 0)
00265     i = 0;
00266   if (ovector)
00267     *ovector = 0L;
00268   int dummyPos;
00269   if (!pos)
00270     pos = &dummyPos;
00271   *pos = -1;
00272   if (i > s.size() || s.isNull())
00273     return UString::null;
00274 
00275 #ifdef HAVE_PCREPOSIX
00276   int ovecsize = (nrSubPatterns+1)*3; // see pcre docu
00277   if (ovector) *ovector = new int[ovecsize];
00278   if (!pcregex)
00279     return UString::null;
00280 
00281   int startPos;
00282   int nextPos;
00283 
00284   if (utf8Support == Supported) {
00285     startPos = i;
00286     while (originalPos[startPos] < i)
00287       ++startPos;
00288 
00289     nextPos = startPos;
00290     while (originalPos[nextPos] < (i + 1))
00291       ++nextPos;
00292   } else {
00293     startPos = i;
00294     nextPos  = i + 1;
00295   }
00296 
00297   if (pcre_exec(pcregex, NULL, buffer, bufferSize, startPos,
00298                 m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED) : 0, // see man pcretest
00299                 ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00300   {
00301     // Failed to match.
00302     if ((flgs & Global) && m_notEmpty && ovector)
00303     {
00304       // We set m_notEmpty ourselves, to look for a non-empty match
00305       // (see man pcretest or pcretest.c for details).
00306       // So we don't stop here, we want to try again at i+1.
00307 #ifdef KJS_VERBOSE
00308       fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
00309 #endif
00310       m_notEmpty = 0;
00311       if (pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, 0,
00312                     ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00313         return UString::null;
00314     }
00315     else // done
00316       return UString::null;
00317   }
00318 
00319   // Got a match, proceed with it.
00320   // But fix up the ovector if need be..
00321   if (ovector && originalPos) {
00322     for (unsigned c = 0; c < 2 * (nrSubPatterns + 1); ++c) {
00323       if ((*ovector)[c] != -1)
00324         (*ovector)[c] = originalPos[(*ovector)[c]];
00325     }
00326   }
00327 
00328   if (!ovector)
00329     return UString::null; // don't rely on the return value if you pass ovector==0
00330 #else
00331   const uint maxMatch = 10;
00332   regmatch_t rmatch[maxMatch];
00333 
00334   char *str = strdup(s.ascii()); // TODO: why ???
00335   if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
00336     free(str);
00337     return UString::null;
00338   }
00339   free(str);
00340 
00341   if (!ovector) {
00342     *pos = rmatch[0].rm_so + i;
00343     return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
00344   }
00345 
00346   // map rmatch array to ovector used in PCRE case
00347   nrSubPatterns = 0;
00348   for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
00349     nrSubPatterns++;
00350     // if the nonEmpty flag is set, return a failed match if any of the
00351     // subMatches happens to be an empty string.
00352     if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo) 
00353       return UString::null;
00354   }
00355   // Allow an ovector slot to return the (failed) match result.
00356   if (nrSubPatterns == 0) nrSubPatterns = 1;
00357   
00358   int ovecsize = (nrSubPatterns)*3; // see above
00359   *ovector = new int[ovecsize];
00360   for (uint j = 0; j < nrSubPatterns; j++) {
00361       (*ovector)[2*j] = rmatch[j].rm_so + i;
00362       (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
00363   }
00364 #endif
00365 
00366   *pos = (*ovector)[0];
00367   if ( *pos == (*ovector)[1] && (flgs & Global) )
00368   {
00369     // empty match, next try will be with m_notEmpty=true
00370     m_notEmpty=true;
00371   }
00372   return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
00373 }
00374 
#if 0 // unused
// Reports whether the expression matches anywhere in s. Compiled out.
bool RegExp::test(const UString &s, int)
{
#ifdef HAVE_PCREPOSIX
  int ovector[300];
  CString buffer(s.cstring());

  if (s.isNull())
    return false;

  const int rc = pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
                           0, ovector, 300);
  return rc != PCRE_ERROR_NOMATCH;

#else

  char *str = strdup(s.ascii());
  const bool matched = (regexec(&preg, str, 0, 0, 0) == 0);
  free(str);

  return matched;
#endif
}
#endif
KDE Home | KDE Accessibility Home | Description of Access Keys