Ability to scan single archive (e.g. compressed .pdf) -> extract useful description

This commit is contained in:
Bryan Ashby 2017-05-21 11:45:50 -06:00
parent 019596d709
commit d0e2d41c00
5 changed files with 176 additions and 83 deletions

View File

@ -280,7 +280,10 @@ function getDefaultConfig() {
cmd : 'exiftool', cmd : 'exiftool',
args : [ args : [
'-charset', 'utf8', '{filePath}', '-charset', 'utf8', '{filePath}',
'--directory', '--filepermissions', '--exiftoolversion', '--filename', '--filesize', '--filemodifydate', '--fileaccessdate', '--fileinodechangedate' // exclude the following:
'--directory', '--filepermissions', '--exiftoolversion', '--filename', '--filesize',
'--filemodifydate', '--fileaccessdate', '--fileinodechangedate', '--createdate', '--modifydate',
'--metadatadate', '--xmptoolkit'
] ]
} }
}, },

View File

@ -22,7 +22,7 @@ const crypto = require('crypto');
const paths = require('path'); const paths = require('path');
const temptmp = require('temptmp').createTrackedSession('file_area'); const temptmp = require('temptmp').createTrackedSession('file_area');
const iconv = require('iconv-lite'); const iconv = require('iconv-lite');
const exec = require('child_process').exec; const execFile = require('child_process').execFile;
const moment = require('moment'); const moment = require('moment');
exports.isInternalArea = isInternalArea; exports.isInternalArea = isInternalArea;
@ -262,48 +262,16 @@ function logDebug(obj, msg) {
} }
} }
function populateFileEntryWithArchive(fileEntry, filePath, stepInfo, iterator, cb) { function extractAndProcessDescFiles(fileEntry, filePath, archiveEntries, cb) {
const archiveUtil = ArchiveUtil.getInstance();
const archiveType = fileEntry.meta.archive_type; // we set this previous to populateFileEntryWithArchive()
async.waterfall( async.waterfall(
[ [
function getArchiveFileList(callback) { function extractDescFiles(callback) {
stepInfo.step = 'archive_list_start';
iterator(err => {
if(err) {
return callback(err);
}
archiveUtil.listEntries(filePath, archiveType, (err, entries) => {
if(err) {
stepInfo.step = 'archive_list_failed';
} else {
stepInfo.step = 'archive_list_finish';
stepInfo.archiveEntries = entries || [];
}
iterator(iterErr => {
return callback( iterErr, entries || [] ); // ignore original |err| here
});
});
});
},
function processDescFilesStart(entries, callback) {
stepInfo.step = 'desc_files_start';
iterator(err => {
return callback(err, entries);
});
},
function extractDescFiles(entries, callback) {
// :TODO: would be nice if these RegExp's were cached // :TODO: would be nice if these RegExp's were cached
// :TODO: this is long winded... // :TODO: this is long winded...
const extractList = []; const extractList = [];
const shortDescFile = entries.find( e => { const shortDescFile = archiveEntries.find( e => {
return Config.fileBase.fileNamePatterns.desc.find( pat => new RegExp(pat, 'i').test(e.fileName) ); return Config.fileBase.fileNamePatterns.desc.find( pat => new RegExp(pat, 'i').test(e.fileName) );
}); });
@ -311,7 +279,7 @@ function populateFileEntryWithArchive(fileEntry, filePath, stepInfo, iterator, c
extractList.push(shortDescFile.fileName); extractList.push(shortDescFile.fileName);
} }
const longDescFile = entries.find( e => { const longDescFile = archiveEntries.find( e => {
return Config.fileBase.fileNamePatterns.descLong.find( pat => new RegExp(pat, 'i').test(e.fileName) ); return Config.fileBase.fileNamePatterns.descLong.find( pat => new RegExp(pat, 'i').test(e.fileName) );
}); });
@ -328,7 +296,8 @@ function populateFileEntryWithArchive(fileEntry, filePath, stepInfo, iterator, c
return callback(err); return callback(err);
} }
archiveUtil.extractTo(filePath, tempDir, archiveType, extractList, err => { const archiveUtil = ArchiveUtil.getInstance();
archiveUtil.extractTo(filePath, tempDir, fileEntry.meta.archive_type, extractList, err => {
if(err) { if(err) {
return callback(err); return callback(err);
} }
@ -384,6 +353,101 @@ function populateFileEntryWithArchive(fileEntry, filePath, stepInfo, iterator, c
return callback(null); return callback(null);
}); });
}, },
],
err => {
return cb(err);
}
);
}
//
//  Handles an archive that contains a single entry (e.g. a zipped up .pdf):
//  the entry is extracted to a temporary directory and then inspected with
//  populateFileEntryInfoFromFile(), the same way a non-archive file is.
//
//  fileEntry      : entry being populated; meta.archive_type is read and
//                   desc is set from the archive's file name if still empty
//  filePath       : path to the archive itself
//  archiveEntries : entries listed for the archive; only the first is used
//  cb             : called as cb(err)
//
//  When no usable entry name is available nothing is extracted; desc falls
//  back to getDescFromFileName(filePath) and cb(null) is called.
//
function extractAndProcessSingleArchiveEntry(fileEntry, filePath, archiveEntries, cb) {
    // ensure we only extract one - there should only be one anyway -- we also just need the fileName
    const firstEntry    = Array.isArray(archiveEntries) ? archiveEntries[0] : null;
    const entryFileName = firstEntry && firstEntry.fileName;

    if(!entryFileName) {
        // Without a name, paths.join() below would throw on an undefined segment
        if(!fileEntry.desc) {
            fileEntry.desc = getDescFromFileName(filePath);
        }
        return cb(null);
    }

    async.waterfall(
        [
            function extractToTemp(callback) {
                // :TODO: we may want to skip this if the compressed file is too large...
                temptmp.mkdir( { prefix : 'enigextract-' }, (err, tempDir) => {
                    if(err) {
                        return callback(err);
                    }

                    const archiveUtil = ArchiveUtil.getInstance();
                    archiveUtil.extractTo(filePath, tempDir, fileEntry.meta.archive_type, [ entryFileName ], err => {
                        if(err) {
                            return callback(err);
                        }

                        // hand the extracted file's location to the next step
                        return callback(null, paths.join(tempDir, entryFileName));
                    });
                });
            },
            function processSingleExtractedFile(extractedFile, callback) {
                populateFileEntryInfoFromFile(fileEntry, extractedFile, err => {
                    // fall back to a description derived from the archive's own file name
                    if(!fileEntry.desc) {
                        fileEntry.desc = getDescFromFileName(filePath);
                    }
                    return callback(err);
                });
            }
        ],
        err => {
            return cb(err);
        }
    );
}
function populateFileEntryWithArchive(fileEntry, filePath, stepInfo, iterator, cb) {
const archiveUtil = ArchiveUtil.getInstance();
const archiveType = fileEntry.meta.archive_type; // we set this previous to populateFileEntryWithArchive()
async.waterfall(
[
function getArchiveFileList(callback) {
stepInfo.step = 'archive_list_start';
iterator(err => {
if(err) {
return callback(err);
}
archiveUtil.listEntries(filePath, archiveType, (err, entries) => {
if(err) {
stepInfo.step = 'archive_list_failed';
} else {
stepInfo.step = 'archive_list_finish';
stepInfo.archiveEntries = entries || [];
}
iterator(iterErr => {
return callback( iterErr, entries || [] ); // ignore original |err| here
});
});
});
},
function processDescFilesStart(entries, callback) {
stepInfo.step = 'desc_files_start';
iterator(err => {
return callback(err, entries);
});
},
function extractDescFromArchive(entries, callback) {
//
// If we have a -single- entry in the archive, extract that file
// and try retrieving info in the non-archive manner. This should
// work for things like zipped up .pdf files.
//
// Otherwise, try to find particular desc files such as FILE_ID.DIZ
// and README.1ST
//
const archDescHandler = (1 === entries.length) ? extractAndProcessSingleArchiveEntry : extractAndProcessDescFiles;
archDescHandler(fileEntry, filePath, entries, err => {
return callback(err);
});
},
function attemptReleaseYearEstimation(callback) { function attemptReleaseYearEstimation(callback) {
attemptSetEstimatedReleaseDate(fileEntry); attemptSetEstimatedReleaseDate(fileEntry);
return callback(null); return callback(null);
@ -413,18 +477,10 @@ function getInfoExtractUtilForDesc(mimeType, descType) {
return util; return util;
} }
function populateFileEntryNonArchive(fileEntry, filePath, stepInfo, iterator, cb) { function populateFileEntryInfoFromFile(fileEntry, filePath, cb) {
async.series(
[
function processDescFilesStart(callback) {
stepInfo.step = 'desc_files_start';
return iterator(callback);
},
function getDescriptions(callback) {
const mimeType = resolveMimeType(filePath); const mimeType = resolveMimeType(filePath);
if(!mimeType) { if(!mimeType) {
return callback(null); return cb(null);
} }
async.eachSeries( [ 'short', 'long' ], (descType, nextDesc) => { async.eachSeries( [ 'short', 'long' ], (descType, nextDesc) => {
@ -435,10 +491,11 @@ function populateFileEntryNonArchive(fileEntry, filePath, stepInfo, iterator, cb
const args = (util.args || [ '{filePath}'] ).map( arg => stringFormat(arg, { filePath : filePath } ) ); const args = (util.args || [ '{filePath}'] ).map( arg => stringFormat(arg, { filePath : filePath } ) );
exec(`${util.cmd} ${args.join(' ')}`, (err, stdout) => { execFile(util.cmd, args, { timeout : 1000 * 30 }, (err, stdout) => {
if(err) { if(err || !stdout) {
const reason = err ? err.message : 'No description produced';
logDebug( logDebug(
{ error : err.message, cmd : util.cmd, args : args }, { reason : reason, cmd : util.cmd, args : args },
`${_.upperFirst(descType)} description command failed` `${_.upperFirst(descType)} description command failed`
); );
} else { } else {
@ -463,7 +520,24 @@ function populateFileEntryNonArchive(fileEntry, filePath, stepInfo, iterator, cb
return nextDesc(null); return nextDesc(null);
}); });
}, () => { }, () => {
return callback(null); return cb(null);
});
}
function populateFileEntryNonArchive(fileEntry, filePath, stepInfo, iterator, cb) {
async.series(
[
function processDescFilesStart(callback) {
stepInfo.step = 'desc_files_start';
return iterator(callback);
},
function getDescriptions(callback) {
populateFileEntryInfoFromFile(fileEntry, filePath, err => {
if(!fileEntry.desc) {
fileEntry.desc = getDescFromFileName(filePath);
}
return callback(err);
}); });
}, },
function processDescFilesFinish(callback) { function processDescFilesFinish(callback) {

View File

@ -95,7 +95,7 @@ const PREDEFINED_MCI_GENERATORS = {
const byteSize = StatLog.getUserStatNum(client.user, 'dl_total_bytes'); const byteSize = StatLog.getUserStatNum(client.user, 'dl_total_bytes');
return formatByteSize(byteSize, true); // true=withAbbr return formatByteSize(byteSize, true); // true=withAbbr
}, },
UP : function userNumUploadsclient(client) { return userStatAsString(client, 'ul_total_count', 0); }, // Obv/2 UP : function userNumUploads(client) { return userStatAsString(client, 'ul_total_count', 0); }, // Obv/2
UK : function userByteUpload(client) { // Obv/2 uses UK=uploaded Kbytes UK : function userByteUpload(client) { // Obv/2 uses UK=uploaded Kbytes
const byteSize = StatLog.getUserStatNum(client.user, 'ul_total_bytes'); const byteSize = StatLog.getUserStatNum(client.user, 'ul_total_bytes');
return formatByteSize(byteSize, true); // true=withAbbr return formatByteSize(byteSize, true); // true=withAbbr

View File

@ -44,7 +44,8 @@
"uuid": "^3.0.1", "uuid": "^3.0.1",
"uuid-parse": "^1.0.0", "uuid-parse": "^1.0.0",
"ws" : "^2.3.1", "ws" : "^2.3.1",
"graceful-fs" : "^4.1.11" "graceful-fs" : "^4.1.11",
"exiftool" : "^0.0.3"
}, },
"devDependencies": {}, "devDependencies": {},
"engines": { "engines": {

View File

@ -19,6 +19,11 @@ const FILETYPE_HANDLERS = {};
[ 'PNG', 'JPEG', 'GIF', 'WEBP', 'XCF' ].forEach(ext => FILETYPE_HANDLERS[ext] = imageFile); [ 'PNG', 'JPEG', 'GIF', 'WEBP', 'XCF' ].forEach(ext => FILETYPE_HANDLERS[ext] = imageFile);
function audioFile(metadata) { function audioFile(metadata) {
// nothing if we don't know at least the author or title
if(!metadata.author && !metadata.title) {
return;
}
let desc = `${metadata.artist||'Unknown Artist'} - ${metadata.title||'Unknown'} (`; let desc = `${metadata.artist||'Unknown Artist'} - ${metadata.title||'Unknown'} (`;
if(metadata.year) { if(metadata.year) {
desc += `${metadata.year}, `; desc += `${metadata.year}, `;
@ -28,6 +33,11 @@ function audioFile(metadata) {
} }
function documentFile(metadata) { function documentFile(metadata) {
// nothing if we don't know at least the author or title
if(!metadata.author && !metadata.title) {
return;
}
let desc = `${metadata.author||'Unknown Author'} - ${metadata.title||'Unknown'}`; let desc = `${metadata.author||'Unknown Author'} - ${metadata.title||'Unknown'}`;
const created = moment(metadata.createdate); const created = moment(metadata.createdate);
if(created.isValid()) { if(created.isValid()) {
@ -86,7 +96,12 @@ function main() {
return -1; return -1;
} }
console.info(handler(metadata)); const info = handler(metadata);
if(!info) {
return -1;
}
console.info(info);
return 0; return 0;
}); });
}); });