Add parallel count heuristic

This commit is contained in:
Erin Power 2020-03-21 12:57:40 +01:00
parent 2a3f735260
commit a8afa67057
5 changed files with 88 additions and 41 deletions

1
.gitignore vendored
View File

@ -6,4 +6,5 @@ src/test
*.rustfmt
.DS_Store
.tokeirc
results.csv
node_modules

View File

@ -39,12 +39,11 @@ fi
cargo build --release
if [ $FULL = true ]; then
hyperfine -w 5 "target/release/tokei $input" \
hyperfine -w 10 --export-csv './results.csv' "target/release/tokei $input" \
"tokei $input" \
"scc $input" \
"loc $input" \
"cloc $input"
"loc $input" # \ "cloc $input"
else
hyperfine -w 10 -m 50 "target/release/tokei $input" \
hyperfine -w 5 "target/release/tokei $input" \
"tokei $input"
fi

View File

@ -16,6 +16,7 @@ use crate::{
use encoding_rs_io::DecodeReaderBytesBuilder;
use grep_searcher::LineIter;
use rayon::prelude::*;
use self::LanguageType::*;
@ -54,16 +55,66 @@ impl LanguageType {
text: A,
config: &Config,
) -> Stats {
let lines = LineIter::new(b'\n', text.as_ref());
let text = text.as_ref();
let lines = LineIter::new(b'\n', text);
let mut stats = Stats::new(path);
let syntax = SyntaxCounter::new(self);
if self.is_blank() {
let count = lines.count();
stats.lines = count;
stats.code = count;
stats
} else if let Some(end) = syntax
.shared
.important_syntax
.earliest_find(text)
.and_then(|m| {
// Get the position of the last line before the important
// syntax.
text[..=m.start()]
.into_iter()
.rev()
.position(|&c| c == b'\n')
.filter(|&p| p != 0)
.map(|p| m.start() - p)
})
{
let (skippable_text, rest) = text.split_at(end + 1);
let lines = LineIter::new(b'\n', skippable_text);
let is_fortran = syntax.shared.is_fortran;
let comments = syntax.shared.line_comments;
let (mut stats, (code, comments, blanks)) = rayon::join(
move || self.parse_lines(config, LineIter::new(b'\n', rest), stats, syntax),
move || {
lines
.par_bridge()
.map(|line| {
// FORTRAN has a rule where it only counts as a comment if it's the
// first character in the column, so removing starting whitespace
// could cause a miscount.
let line = if is_fortran { line } else { line.trim() };
trace!("{}", String::from_utf8_lossy(line));
if line.trim().is_empty() {
(0, 0, 1)
} else if comments.iter().any(|c| line.starts_with(c.as_bytes())) {
(0, 1, 0)
} else {
(1, 0, 0)
}
})
.reduce(|| (0, 0, 0), |a, b| (a.0 + b.0, a.1 + b.1, a.2 + b.2))
},
);
stats.code += code;
stats.comments += comments;
stats.blanks += blanks;
stats
} else {
self.parse_lines(config, lines, stats)
self.parse_lines(config, lines, stats, syntax)
}
}
@ -73,9 +124,8 @@ impl LanguageType {
config: &Config,
lines: impl IntoIterator<Item = &'a [u8]>,
mut stats: Stats,
mut syntax: SyntaxCounter,
) -> Stats {
let mut syntax = SyntaxCounter::new(self);
for line in lines {
// FORTRAN has a rule where it only counts as a comment if it's the
// first character in the column, so removing starting whitespace
@ -161,12 +211,7 @@ impl LanguageType {
|| (
// If we're currently in a comment or we just ended
// with one.
syntax
.shared
.any_comments
.earliest_find(line)
.map_or(false, |e| e.start() == 0)
&& syntax.quote.is_none()
syntax.shared.any_comments.is_match(line) && syntax.quote.is_none()
)
|| ((
// If we're currently in a doc string or we just ended

View File

@ -1,6 +1,6 @@
use std::sync::Arc;
use aho_corasick::AhoCorasick;
use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
use dashmap::DashMap;
use log::Level::Trace;
@ -29,8 +29,8 @@ pub(crate) struct SyntaxCounter {
pub(crate) struct SharedMatchers {
pub allows_nested: bool,
pub doc_quotes: &'static [(&'static str, &'static str)],
pub important_syntax: AhoCorasick,
pub any_comments: AhoCorasick,
pub important_syntax: AhoCorasick<u16>,
pub any_comments: AhoCorasick<u16>,
pub is_fortran: bool,
pub line_comments: &'static [&'static str],
pub multi_line_comments: &'static [(&'static str, &'static str)],
@ -52,12 +52,22 @@ impl SharedMatchers {
}
pub fn init(language: LanguageType) -> Self {
/// Builds the Aho-Corasick matcher for `pattern`, typed as `AhoCorasick<u16>`.
///
/// `anchored` is forwarded unchanged to the builder's `anchored` option.
/// The remaining options are fixed: byte classes are disabled, and both the
/// DFA and the prefilter are enabled.
///
/// # Panics
///
/// Panics if `build_with_size` returns an error for the given patterns.
fn init_corasick(pattern: &[&'static str], anchored: bool) -> AhoCorasick<u16> {
    // The builder setters hand back `&mut Self`, so the whole configuration
    // can be expressed as one chain on a temporary builder; the built
    // automaton is an owned value and does not borrow from it.
    AhoCorasickBuilder::new()
        .anchored(anchored)
        .byte_classes(false)
        .dfa(true)
        .prefilter(true)
        .build_with_size(pattern)
        .unwrap()
}
Self {
allows_nested: language.allows_nested(),
doc_quotes: language.doc_quotes(),
is_fortran: language.is_fortran(),
important_syntax: AhoCorasick::new_auto_configured(language.important_syntax()),
any_comments: AhoCorasick::new_auto_configured(language.start_any_comments()),
important_syntax: init_corasick(language.important_syntax(), false),
any_comments: init_corasick(language.start_any_comments(), true),
line_comments: language.line_comments(),
multi_line_comments: language.multi_line_comments(),
nested_comments: language.nested_comments(),

View File

@ -20,6 +20,7 @@ pub fn get_all_files<A: AsRef<Path>>(
languages: &mut BTreeMap<LanguageType, Language>,
config: &Config,
) {
let languages = parking_lot::Mutex::new(languages);
let (tx, rx) = crossbeam_channel::unbounded();
let mut paths = paths.iter();
@ -88,31 +89,22 @@ pub fn get_all_files<A: AsRef<Path>>(
let types: Option<&[LanguageType]> = config.types.as_ref().map(|v| &**v);
let iter = rx
.into_iter()
rx.into_iter()
.par_bridge()
.filter_map(|e| LanguageType::from_path(e.path(), &config).map(|l| (e, l)))
.filter(|(_, l)| types.map(|t| t.contains(l)).unwrap_or(true))
.map(|(entry, language)| {
language
.parse(entry.into_path(), &config)
.map(|stats| (language, Some(stats)))
.unwrap_or_else(|(e, path)| {
error!("Error reading {}:\n{}", path.display(), e);
(language, None)
})
.filter(|(_, l)| types.map_or(true, |t| t.contains(l)))
.for_each(|(entry, language)| {
let result = language.parse(entry.into_path(), &config);
let mut lock = languages.lock();
let entry = lock.entry(language).or_insert_with(Language::new);
match result {
Ok(stats) => entry.add_stat(stats),
Err((error, path)) => {
entry.mark_inaccurate();
error!("Error reading {}:\n{}", path.display(), error);
}
}
})
.collect::<Vec<_>>();
for (language_type, stats) in iter {
let entry = languages.entry(language_type).or_insert_with(Language::new);
if let Some(stats) = stats {
entry.add_stat(stats);
} else {
entry.mark_inaccurate();
}
}
}
pub(crate) fn get_extension(path: &Path) -> Option<String> {