LCOV - code coverage report
Current view: top level - lib/Support - TrigramIndex.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 38 38 100.0 %
Date: 2018-10-20 13:21:21 Functions: 2 2 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- TrigramIndex.cpp - a heuristic for SpecialCaseList ----------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : // TrigramIndex implements a heuristic for SpecialCaseList that allows to
      11             : // filter out ~99% incoming queries when all regular expressions in the
      12             : // SpecialCaseList are simple wildcards with '*' and '.'. If rules are more
      13             : // complicated, the check is defeated and it will always pass the queries to a
      14             : // full regex.
      15             : //
      16             : //===----------------------------------------------------------------------===//
      17             : 
      18             : #include "llvm/Support/TrigramIndex.h"
      19             : #include "llvm/ADT/SmallVector.h"
      20             : 
      21             : #include <set>
      22             : #include <string>
      23             : #include <unordered_map>
      24             : 
      25             : using namespace llvm;
      26             : 
      27             : static const char RegexAdvancedMetachars[] = "()^$|+?[]\\{}";
      28             : 
      29             : static bool isAdvancedMetachar(unsigned Char) {
      30        9825 :   return strchr(RegexAdvancedMetachars, Char) != nullptr;
      31             : }
      32             : 
      33         795 : void TrigramIndex::insert(std::string Regex) {
      34        1022 :   if (Defeated) return;
      35             :   std::set<unsigned> Was;
      36         793 :   unsigned Cnt = 0;
      37         793 :   unsigned Tri = 0;
      38             :   unsigned Len = 0;
      39             :   bool Escaped = false;
      40       10738 :   for (unsigned Char : Regex) {
      41        9953 :     if (!Escaped) {
      42             :       // Regular expressions allow escaping symbols by preceding it with '\'.
      43        9889 :       if (Char == '\\') {
      44             :         Escaped = true;
      45             :         continue;
      46             :       }
      47        9825 :       if (isAdvancedMetachar(Char)) {
      48             :         // This is a more complicated regex than we can handle here.
      49           6 :         Defeated = true;
      50           6 :         return;
      51             :       }
      52        9819 :       if (Char == '.' || Char == '*') {
      53        1371 :         Tri = 0;
      54             :         Len = 0;
      55        1371 :         continue;
      56             :       }
      57             :     }
      58        8512 :     if (Escaped && Char >= '1' && Char <= '9') {
      59           2 :       Defeated = true;
      60           2 :       return;
      61             :     }
      62             :     // We have already handled escaping and can reset the flag.
      63             :     Escaped = false;
      64        8510 :     Tri = ((Tri << 8) + Char) & 0xFFFFFF;
      65        8510 :     Len++;
      66        8510 :     if (Len < 3)
      67             :       continue;
      68             :     // We don't want the index to grow too much for the popular trigrams,
      69             :     // as they are weak signals. It's ok to still require them for the
      70             :     // rules we have already processed. It's just a small additional
      71             :     // computational cost.
      72        6958 :     if (Index[Tri].size() >= 4)
      73             :       continue;
      74        6949 :     Cnt++;
      75             :     if (!Was.count(Tri)) {
      76             :       // Adding the current rule to the index.
      77       13860 :       Index[Tri].push_back(Counts.size());
      78             :       Was.insert(Tri);
      79             :     }
      80             :   }
      81         785 :   if (!Cnt) {
      82             :     // This rule does not have remarkable trigrams to rely on.
      83             :     // We have to always call the full regex chain.
      84         219 :     Defeated = true;
      85         219 :     return;
      86             :   }
      87         566 :   Counts.push_back(Cnt);
      88             : }
      89             : 
      90        4306 : bool TrigramIndex::isDefinitelyOut(StringRef Query) const {
      91        4306 :   if (Defeated)
      92             :     return false;
      93        3286 :   std::vector<unsigned> CurCounts(Counts.size());
      94        1643 :   unsigned Tri = 0;
      95       44112 :   for (size_t I = 0; I < Query.size(); I++) {
      96       42701 :     Tri = ((Tri << 8) + Query[I]) & 0xFFFFFF;
      97       42701 :     if (I < 2)
      98             :       continue;
      99             :     const auto &II = Index.find(Tri);
     100       39424 :     if (II == Index.end())
     101             :       continue;
     102       15298 :     for (size_t J : II->second) {
     103        7812 :       CurCounts[J]++;
     104             :       // If we have reached a desired limit, we have to look at the query
     105             :       // more closely by running a full regex.
     106       15624 :       if (CurCounts[J] >= Counts[J])
     107             :         return false;
     108             :     }
     109             :   }
     110             :   return true;
     111             : }

Generated by: LCOV version 1.13