LCOV - code coverage report
Current view: top level - lib/Support - TrigramIndex.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 51 51 100.0 %
Date: 2017-09-14 15:23:50 Functions: 2 2 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- TrigramIndex.cpp - a heuristic for SpecialCaseList ----------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : // TrigramIndex implements a heuristic for SpecialCaseList that allows to
      11             : // filter out ~99% incoming queries when all regular expressions in the
      12             : // SpecialCaseList are simple wildcards with '*' and '.'. If rules are more
      13             : // complicated, the check is defeated and it will always pass the queries to a
      14             : // full regex.
      15             : //
      16             : //===----------------------------------------------------------------------===//
      17             : 
      18             : #include "llvm/Support/TrigramIndex.h"
      19             : #include "llvm/ADT/SmallVector.h"
      20             : 
      21             : #include <set>
      22             : #include <string>
      23             : #include <unordered_map>
      24             : 
      25             : using namespace llvm;
      26             : 
      27             : static const char RegexAdvancedMetachars[] = "()^$|+?[]\\{}";
      28             : 
      29             : static bool isAdvancedMetachar(unsigned Char) {
      30       18588 :   return strchr(RegexAdvancedMetachars, Char) != nullptr;
      31             : }
      32             : 
      33         534 : void TrigramIndex::insert(std::string Regex) {
      34         550 :   if (Defeated) return;
      35        1050 :   std::set<unsigned> Was;
      36         532 :   unsigned Cnt = 0;
      37         532 :   unsigned Tri = 0;
      38         532 :   unsigned Len = 0;
      39         532 :   bool Escaped = false;
      40       11544 :   for (unsigned Char : Regex) {
      41        9422 :     if (!Escaped) {
      42             :       // Regular expressions allow escaping symbols by preceding it with '\'.
      43        9422 :       if (Char == '\\') {
      44          64 :         Escaped = true;
      45          64 :         continue;
      46             :       }
      47        9294 :       if (isAdvancedMetachar(Char)) {
      48             :         // This is a more complicated regex than we can handle here.
      49           4 :         Defeated = true;
      50           4 :         return;
      51             :       }
      52       10262 :       if (Char == '.' || Char == '*') {
      53         972 :         Tri = 0;
      54         972 :         Len = 0;
      55         972 :         continue;
      56             :       }
      57             :     }
      58        8382 :     if (Escaped && Char >= '1' && Char <= '9') {
      59           2 :       Defeated = true;
      60           2 :       return;
      61             :     }
      62             :     // We have already handled escaping and can reset the flag.
      63        8380 :     Escaped = false;
      64        8380 :     Tri = ((Tri << 8) + Char) & 0xFFFFFF;
      65        8380 :     Len++;
      66        8380 :     if (Len < 3)
      67        1372 :       continue;
      68             :     // We don't want the index to grow too much for the popular trigrams,
      69             :     // as they are weak signals. It's ok to still require them for the
      70             :     // rules we have already processed. It's just a small additional
      71             :     // computational cost.
      72       21024 :     if (Index[Tri].size() >= 4)
      73           9 :       continue;
      74        6999 :     Cnt++;
      75        6980 :     if (!Was.count(Tri)) {
      76             :       // Adding the current rule to the index.
      77       20940 :       Index[Tri].push_back(Counts.size());
      78             :       Was.insert(Tri);
      79             :     }
      80             :   }
      81         526 :   if (!Cnt) {
      82             :     // This rule does not have remarkable trigrams to rely on.
      83             :     // We have to always call the full regex chain.
      84           8 :     Defeated = true;
      85           8 :     return;
      86             :   }
      87         518 :   Counts.push_back(Cnt);
      88             : }
      89             : 
      90         968 : bool TrigramIndex::isDefinitelyOut(StringRef Query) const {
      91         968 :   if (Defeated)
      92             :     return false;
      93        2862 :   std::vector<unsigned> CurCounts(Counts.size());
      94         954 :   unsigned Tri = 0;
      95       69854 :   for (size_t I = 0; I < Query.size(); I++) {
      96       68300 :     Tri = ((Tri << 8) + Query[I]) & 0xFFFFFF;
      97       34150 :     if (I < 2)
      98        1903 :       continue;
      99       64494 :     const auto &II = Index.find(Tri);
     100       64494 :     if (II == Index.end())
     101       24651 :       continue;
     102       30301 :     for (size_t J : II->second) {
     103       15380 :       CurCounts[J]++;
     104             :       // If we have reached a desired limit, we have to look at the query
     105             :       // more closely by running a full regex.
     106       15380 :       if (CurCounts[J] >= Counts[J])
     107             :         return false;
     108             :     }
     109             :   }
     110             :   return true;
     111             : }

Generated by: LCOV version 1.13