на большинстве высокоуровневых языков это вопрос десятка строк.
Смеха ради набросал реализации на двух языках, которые были под рукой:
Python:
import os
import re
from collections import Counter
# Directory to scan, relative to the directory that contains this script.
PROJECT_DIRECTORY = '.'
# Exact file or directory names that are skipped.
ignoreName = {'__pycache__', '.git', '.gitignore', 'README.md'}
# File extensions (without the leading dot) that are skipped.
ignoreExt = {'wav', 'jpg', 'png'}
CURRENT_DIRECTORY = os.path.dirname(__file__)
# Matches a run of separator characters, i.e. anything that is not a Latin or
# Cyrillic letter, a digit or an underscore. 'ё'/'Ё' are listed explicitly
# because they lie outside the а-я / А-Я code point ranges; without them a word
# such as 'ёлка' would be cut in two.
splitToLiteralsReg = re.compile(r"[^a-zA-Zа-яА-ЯёЁ0-9_]+")
# Scan root, expressed relative to the current working directory.
target_path = os.path.normpath(os.path.relpath(os.path.join(CURRENT_DIRECTORY, PROJECT_DIRECTORY), os.getcwd()))
def shouldIgnore(root, name):
    """Tell whether a directory entry must be skipped.

    Args:
        root: Directory that contains the entry. Not used by the check itself;
            kept so existing callers keep working.
        name: Bare file or directory name (no path components).

    Returns:
        True when ``name`` is listed in ``ignoreName`` or its extension,
        compared case-insensitively, is listed in ``ignoreExt``.
    """
    if name in ignoreName:
        return True
    # splitext() returns an empty extension for dot-less names and for dotfiles
    # such as '.gitignore', so a file literally called 'png' is not mistaken
    # for an image, while 'photo.PNG' is still recognised.
    extension = os.path.splitext(name)[1]
    return extension[1:].lower() in ignoreExt
# Frequency table: literal -> number of occurrences across all scanned files.
counter = Counter()
for root, dirs, files in os.walk(target_path):
    # Prune in place: os.walk() only skips a directory when it is removed from
    # this very list object.
    dirs[:] = [
        foldername for foldername in dirs
        if not shouldIgnore(root, foldername)
    ]
    for filename in files:
        if shouldIgnore(root, filename):
            continue
        fullpath = os.path.join(root, filename)
        print('чтение', fullpath)
        # Decode explicitly as UTF-8 instead of the platform default, and
        # replace undecodable bytes so a stray binary file does not abort the
        # whole scan; the replacement character acts as a separator.
        with open(fullpath, encoding='utf-8', errors='replace') as file:
            for line in file:
                # filter(None, ...) drops the empty strings that split()
                # produces at the edges of a line.
                counter.update(filter(None, splitToLiteralsReg.split(line)))
# Print the ranking, most frequent literal first, numbered from 1.
for index, (literal, count) in enumerate(counter.most_common(), start=1):
    print(f'{index}. "{literal}" {count} повторений')
Node.js:
const fs = require('fs')
const path = require('path')
// Directory to scan, relative to the directory that contains this script.
const PROJECT_DIRECTORY = '.'
// Exact file or directory names that are skipped.
const ignoreName = new Set(['__pycache__', '.git', '.gitignore', 'README.md'])
// File extensions (without the leading dot) that are skipped.
const ignoreExt = new Set(['wav', 'jpg', 'png'])
const CURRENT_DIRECTORY = __dirname
// Matches a run of separator characters, i.e. anything that is not a Latin or
// Cyrillic letter, a digit or an underscore. 'ё'/'Ё' are listed explicitly
// because they lie outside the а-я / А-Я code point ranges. No flags are
// needed: both letter cases are spelled out, there are no anchors, and
// String.prototype.split() already splits on every match.
const splitToLiteralsReg = /[^a-zA-Zа-яА-ЯёЁ0-9_]+/
// Absolute path of the directory tree to scan.
const target_path = path.join(CURRENT_DIRECTORY, PROJECT_DIRECTORY)
/**
 * Tell whether a directory entry must be skipped.
 *
 * @param {string} root - Directory that contains the entry. Not used by the
 *   check itself; kept so existing callers keep working.
 * @param {string} name - Bare file or directory name (no path components).
 * @returns {boolean} true when `name` is listed in `ignoreName` or its
 *   extension, compared case-insensitively, is listed in `ignoreExt`.
 */
function shouldIgnore(root, name) {
  if (ignoreName.has(name)) {
    return true
  }
  // path.extname() returns '' for dot-less names and for dotfiles such as
  // '.gitignore', so a file literally called 'png' is not mistaken for an
  // image, while 'photo.PNG' is still recognised.
  const extension = path.extname(name).slice(1).toLowerCase()
  return ignoreExt.has(extension)
}
/**
 * Frequency counter for string literals, modelled on the subset of Python's
 * collections.Counter that this script uses.
 */
class Counter {
  constructor() {
    // A Map rather than a plain object: words such as 'constructor',
    // 'toString' or '__proto__' would otherwise collide with members inherited
    // from Object.prototype, and integer-like keys ('10') would be enumerated
    // ahead of everything else instead of in insertion order.
    this.literals = new Map()
  }

  /**
   * Count one more occurrence of a literal.
   * @param {string} literal - The word to count.
   */
  learnLiteral(literal) {
    // Keys are stringified, as the previous object-based storage did implicitly.
    const key = String(literal)
    this.literals.set(key, (this.literals.get(key) || 0) + 1)
  }

  /**
   * Count every item produced by an iterable.
   * @param {Iterable<string>} iterable - Literals to count.
   */
  update(iterable) {
    for (const literal of iterable) {
      this.learnLiteral(literal)
    }
  }

  /**
   * List all counted literals, most frequent first.
   * @returns {Array<[string, number]>} `[literal, count]` pairs sorted by
   *   descending count; equal counts keep first-seen order.
   */
  most_common() {
    return [...this.literals].sort((a, b) => b[1] - a[1])
  }
}
const counter = new Counter()
// Tells whether a symbolic link resolves to a directory. Broken or
// unresolvable links are deliberately reported as "not a directory".
function isDirectoryLink(fullPath) {
  try {
    return fs.statSync(fullPath).isDirectory()
  } catch {
    return false
  }
}

/**
 * Walk a directory tree top-down, in the manner of Python's os.walk().
 *
 * For every directory, `callback(root, dirs, files)` is invoked before its
 * subdirectories are visited. The callback may mutate `dirs` in place to prune
 * or reorder the descent: the same array is iterated afterwards.
 *
 * Symbolic links that resolve to directories are listed in `dirs` but are
 * never descended into, so callers do not try to read them as files and link
 * cycles cannot cause endless recursion.
 *
 * @param {string} root - Directory to start from.
 * @param {(root: string, dirs: string[], files: string[]) => void} callback
 *   Receives the current directory and the bare names of its subdirectories
 *   and of its other entries.
 */
function walk(root, callback) {
  const dirs = []
  const files = []
  const linkedDirs = new Set()
  for (const entry of fs.readdirSync(root, { withFileTypes: true })) {
    if (entry.isDirectory()) {
      dirs.push(entry.name)
    } else if (entry.isSymbolicLink() && isDirectoryLink(path.join(root, entry.name))) {
      dirs.push(entry.name)
      linkedDirs.add(entry.name)
    } else {
      files.push(entry.name)
    }
  }
  callback(root, dirs, files)
  for (const dirname of dirs) {
    if (!linkedDirs.has(dirname)) {
      walk(path.join(root, dirname), callback)
    }
  }
}
// Scan the project tree and feed every word of every non-ignored file into
// the counter.
walk(target_path, (root, dirs, files) => {
  // Prune `dirs` in place: walk() iterates this same array after the callback
  // returns, so removing a name here prevents the descent into it.
  const keptDirs = dirs.filter(name => !shouldIgnore(root, name))
  dirs.splice(0, dirs.length, ...keptDirs)
  for (const filename of files) {
    if (shouldIgnore(root, filename)) {
      continue
    }
    const fullName = path.join(root, filename)
    console.log('чтение', fullName)
    const text = fs.readFileSync(fullName, { encoding: 'utf-8' })
    // split() yields '' when the text starts or ends with a separator (e.g. a
    // trailing newline); those must not be counted as a literal.
    counter.update(text.split(splitToLiteralsReg).filter(literal => literal !== ''))
  }
})
// Print the ranking, most frequent literal first, numbered from 1.
let index = 0
counter.most_common().forEach(([literal, count]) => {
  index += 1
  console.log(`${index}. "${literal}" ${count} повторений`)
})
Сначала написал скрипт на Python, а на JS просто повторил архитектуру питоновского скрипта: получилось странно, но вполне работоспособно.