Useful MySQL queries

Please note that Datameer doesn't recommend accessing the application database directly in production environments on a regular basis. The database schema might change from version to version without notice.

Ensure that you always have a recent backup before executing any query against the production database.
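
For example, a dump of the whole application schema (named dap here, matching the database size query further down) can be taken with mysqldump; replace <user> with a MySQL user that can read the schema:

#Backup the application database before running ad-hoc queries
mysqldump -u <user> -p dap > dap-backup.sql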

Database Status Queries

#Display the current processes
show full processlist;

 

#Display the InnoDB status (useful for investigating deadlocks)
show engine innodb status;
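
The same data is exposed through the information_schema.PROCESSLIST table, so it can be filtered like any other table, e.g. to show only statements that have been running for more than 60 seconds:

#Display long-running statements
SELECT ID, USER, TIME, STATE, INFO
FROM information_schema.PROCESSLIST
WHERE COMMAND <> 'Sleep' AND TIME > 60
ORDER BY TIME DESC;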

Job Link Queries

#Import to Workbook link
select df3.id, df3.name ImportName,
       wc.id WbConfigId, df.name WorkbookName,
       sht.id SheetId, sht.name SheetName
from workbook_configuration wc
inner join dap_job_configuration djc on wc.id = djc.id
inner join dap_file df on djc.dap_file__id = df.id
inner join sheet sht on wc.id = sht.workbook_fk
inner join data_source_configuration dsc on sht.data_source_id = dsc.id
inner join dap_job_configuration djc3 on dsc.id = djc3.id
inner join dap_file df3 on djc3.dap_file__id = df3.id
order by df3.name;

 

#Workbook to Workbook link
select sht.external_sheet__id, df2.name ParentWorkbookName, sht2.name ParentSheetName,
       wc.id WbConfigId, df.name WorkbookName,
       sht.id SheetId, sht.name SheetName
from workbook_configuration wc
inner join dap_job_configuration djc on wc.id = djc.id
inner join dap_file df on djc.dap_file__id = df.id
inner join sheet sht on wc.id = sht.workbook_fk
inner join sheet sht2 on sht.external_sheet__id = sht2.id
inner join workbook_configuration wc2 on wc2.id = sht2.workbook_fk
inner join dap_job_configuration djc2 on wc2.id = djc2.id
inner join dap_file df2 on djc2.dap_file__id = df2.id
order by df2.name;

 

#Workbook to Infographic link
select wc.id WbConfigId, df2.name WorkbookName,
       sht.id SheetId, sht.name SheetName,
       igm.id IGMId, df.id DapFileId, df.name InfographicName
from infographic_data_set igds
inner join infographic_model_infographic_data_set igmigds on igmigds.data_sets__id = igds.id
inner join infographic_model igm on igmigds.infographic_model__id = igm.id
inner join dap_file df on igm.dap_file__id = df.id
inner join sheet sht on igds.data_set_name = sht.name
inner join workbook_configuration wc on wc.id = sht.workbook_fk
inner join dap_job_configuration djc on wc.id = djc.id
inner join dap_file df2 on djc.dap_file__id = df2.id
order by igm.id;

 

#Workbook to ExportJob link
select wc.id WbConfigId, df2.name WorkbookName,
       sht.id SheetId, sht.name SheetName,
       df.id DapFileId, df.name ExportJobName
from dap_file df
inner join dap_job_configuration djc on df.id = djc.dap_file__id
inner join data_sink_configuration dsc on djc.id = dsc.id
inner join sheet sht on dsc.sheet_fk = sht.id
inner join workbook_configuration wc on wc.id = sht.workbook_fk
inner join dap_job_configuration djc2 on wc.id = djc2.id
inner join dap_file df2 on djc2.dap_file__id = df2.id
order by df2.name;

 

# Data Connection to Import Data
select ds.type_id DataSourceType,
       df.id DataStoreId, df.name DataStoreName,
       df2.id ImportDataId, df2.name ImportDataName
from data_store ds
inner join dap_file df on ds.dap_file__id = df.id
inner join data_source_configuration dsc on ds.id = dsc.connection_fk
inner join dap_job_configuration djc on dsc.id = djc.id
inner join dap_file df2 on djc.dap_file__id = df2.id;

 

# Data Connection to Import Data with Permission and Groups Information
select ds.type_id DataSourceType,
       df.id DataStoreId, df.name DataStoreName,
       df2.id ImportDataId, df2.name ImportDataName, 
       p.other_permission_bits ImportWorldPermissions, p.owner ImportOwner,
       gp.group_name ImportGroupName, gp.permission_bits ImportGroupPermission
from data_store ds
inner join dap_file df on ds.dap_file__id = df.id
inner join data_source_configuration dsc on ds.id = dsc.connection_fk
inner join dap_job_configuration djc on dsc.id = djc.id
inner join dap_file df2 on djc.dap_file__id = df2.id
inner join permission p on df2.permission_fk = p.id
left outer join group_permission gp on p.id = gp.permission__id
order by df.id, df2.id;

The SQL query below lists the data source type, ID, and name; any dependent import jobs with their ID, name, owner, and table name; and any dependent workbooks with their ID, name, and specific sheet ID and name.

#Data connection to Import Job + ID, Name, Owner + Workbook
SELECT ds.type_id DataSourceType, 
	dsf.id DataStoreId, 
	dsf.name DataStoreName, 
	ijf.id ImportDataId, 
	ijf.name ImportDataName, 
	p.owner ImportOwner, 
	jcp.property_value as TableName, 
	wc.id WbConfigId, 
	wbf.name WorkbookName, 
	sht.id SheetId, 
	sht.name SheetName 
FROM workbook_configuration wc, 
	dap_job_configuration wbc, 
	dap_file wbf, 
	sheet sht, 
	data_source_configuration ijc, 
	dap_job_configuration dsc, 
	dap_file ijf, 
	data_store ds, 
	dap_file dsf, 
	data_source_configuration dssc, 
	dap_job_configuration ijjc, 
	permission p, 
	job_configuration_property jcp 
WHERE jcp.property_key = 'tableName' 
	AND wc.id = wbc.id 
	AND wbc.dap_file__id = wbf.id 
	AND wc.id = sht.workbook_fk 
	AND sht.data_source_id = ijc.id 
	AND ijc.id = dsc.id 
	AND dsc.dap_file__id = ijf.id 
	AND ds.dap_file__id = dsf.id 
	AND ds.id = dssc.connection_fk 
	AND dssc.id = ijjc.id 
	AND ijjc.dap_file__id = ijf.id 
	AND ijf.permission_fk = p.id 
	AND ijjc.id = jcp.configuration_fk;

The following query maps each data connection to its dependent export jobs.

#Data Connection to Export Data
SELECT ds.type_id DataSourceType, df.id DataStoreId, df.name DataStoreName, df2.id ExportDataId, df2.name ExportDataName
FROM data_store ds
	INNER JOIN dap_file df ON ds.dap_file__id = df.id
	INNER JOIN data_sink_configuration dsc ON ds.id = dsc.connection_fk
	INNER JOIN dap_job_configuration djc ON dsc.id = djc.id
	INNER JOIN dap_file df2 ON djc.dap_file__id = df2.id;

The next query finds all the workbooks and export jobs based on the list of workbook job execution IDs specified in the WHERE clause. Replace (1,2,3,4,5,6,7,8,9,10) with the actual list of job execution IDs.

#Job Execution ID to Workbooks and Export Jobs
SELECT group_concat(dje.id separator ',') as ListOfJobIDs,djc.id as WorkbookID,df1.name as WorkbookName,djc2.id as ExportJobID,df2.name as ExportJobName 
FROM file_dependency fd 
	INNER JOIN dap_file df1 ON df1.id=fd.dependency_file_id AND df1.extension='WORKBOOK_EXTENSION' 
	INNER JOIN dap_file df2 ON df2.id=fd.source_file_id AND df2.extension='EXPORT_JOB_EXTENSION' 
	INNER JOIN dap_job_configuration djc ON djc.dap_file__id=df1.id 
	INNER JOIN dap_job_execution dje ON dje.dap_job_configuration__id=djc.id 
	INNER JOIN dap_job_configuration djc2 ON djc2.dap_file__id=df2.id 
WHERE dje.id in (1,2,3,4,5,6,7,8,9,10) 
	GROUP BY ExportJobID;
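
Note that GROUP_CONCAT output is truncated at the group_concat_max_len system variable (1024 bytes by default). If the ID list is long, raise the limit for the session before running the query:

SET SESSION group_concat_max_len = 1000000;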

This statement finds all the import jobs and data links linked to the workbooks, based on the workbook job execution IDs. Replace (1,2,3,4,5,6,7,8,9,10) with the actual list of job execution IDs.

#Workbook Execution Job ID to Import Job and Data Link
SELECT group_concat(dje.id separator ',') AS ListOfJobIDs, djc.id AS WorkbookID, djc2.id AS ImportID, df1.name AS ImportName, df2.name AS WorkbookName 
FROM file_dependency fd 
	INNER JOIN dap_file df1 on df1.id=fd.dependency_file_id AND (df1.extension='IMPORT_JOB_EXTENSION' OR df1.extension='IMPORT_LINK_JOB_EXTENSION') 
	INNER JOIN dap_file df2 on df2.id=fd.source_file_id AND df2.extension='WORKBOOK_EXTENSION' 
	INNER JOIN dap_job_configuration djc ON djc.dap_file__id=df2.id 
	INNER JOIN dap_job_execution dje ON dje.dap_job_configuration__id=djc.id 
	INNER JOIN dap_job_configuration djc2 ON djc2.dap_file__id=df1.id 
WHERE dje.id in (1,2,3,4,5,6,7,8,9,10) 
	GROUP BY ImportID;

Find all groups a particular job is shared with (replace 8 with the job configuration ID):

#Shared groups for job
SELECT * FROM group_permission gp 
	JOIN dap_file df ON gp.permission__id = df.permission_fk 
	JOIN dap_job_configuration djc ON df.id = djc.dap_file__id 
WHERE djc.id = 8;

Job Execution Queries

This query provides the job start/end time and some additional information. To get the mappers and reducers and time between tasks, you need to save and sift through the job trace and task logs.

#Current state of executions.
select djc.id JobConfigId, 
       CASE dje.job_status 
       WHEN 0 THEN 'QUEUED'
       WHEN 1 THEN 'RUNNING'
       WHEN 2 THEN 'COMPLETED'
       WHEN 3 THEN 'ERROR'
       WHEN 4 THEN 'COMPLETED_WITH_WARNINGS'
       WHEN 5 THEN 'WAITING_FOR_OTHER_JOB'
       WHEN 6 THEN 'CANCELED'
       WHEN 7 THEN 'ABORTING'
       WHEN 8 THEN 'WAITING'
       ELSE 'OTHER' END JobConfigStatus,
       dje.id JobExeId, 
       dje.start_time StartTime, 
       dje.stop_time StopTime,
       df.name Name, 
       dje.user Executor,
	   TIMEDIFF(dje.stop_time, dje.start_time) Duration
from dap_job_configuration djc
inner join dap_file df on djc.dap_file__id = df.id
inner join dap_job_execution dje on djc.id = dje.dap_job_configuration__id;

Note that Datameer keeps only the last 28 days of execution data before housekeeping expunges it. The retention is a configuration value in the <DM_HOME>/conf/default.properties file:

# Define the maximum number of days job executions are saved in the job history, after a job has been completed. 
housekeeping.execution.max-age=28d
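
A quick sanity check of the retention actually in effect is the oldest execution row still present:

#Oldest job execution still kept in the history
select min(start_time) OldestExecution from dap_job_execution;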

 

# All data related to a workbook w/ and w/out an execution id
select djc.id JobConfigId, djc.dap_file__id ConfigFileId, df.name ConfigName, d.id DataId, d.uri Uri, df2.id DataFileId, 
       df2.name DataName, df2.creation_date DataCreationDate, df2.extension DataFileExtension, df2.file_mode DataFileMode,
       dje.id ExeId, dje.created_data__id ExeDataId
from dap_job_configuration djc
inner join dap_file df on djc.dap_file__id = df.id
inner join data d on djc.id = d.dap_job_configuration__id
inner join dap_file df2 on d.dap_file__id = df2.id
left outer join dap_job_execution dje on djc.id = dje.dap_job_configuration__id and d.id = dje.created_data__id
where djc.id=<id>;

 

#Workbook to worksheet to currently selected sheet
select wc.id WbConfigId, df.id DapFileId, df.name WbName,
       sht.id SheetId, sht.name SheetName, sht.position Position, sht.keep Keep,
       wbv.current_sheet_index CurrentSheetNo
from workbook_configuration wc 
inner join dap_job_configuration djc on wc.id = djc.id 
inner join dap_file df on djc.dap_file__id = df.id 
inner join sheet sht on wc.id = sht.workbook_fk
inner join workbook_viewstate wbv on wc.workbook_view_fk = wbv.id
order by df.name, sht.position;

Job Configuration Queries

#Find configurations with the most data objects
SELECT dap_job_configuration__id, count(*) AS c 
FROM data 
WHERE data.status = 0 
	GROUP BY data.dap_job_configuration__id 
	ORDER BY c DESC LIMIT 25;

 

#Datastore Config Info
SELECT ds.id DataStoreId, dscp.property_key, dscp.property_value, df.name Name
FROM data_store ds
	INNER JOIN data_store_configuration_property dscp ON dscp.configuration_fk = ds.id
	INNER JOIN dap_file df ON ds.dap_file__id = df.id
WHERE df.name = 'SystemDatabase';

 

#Get data-source with max-mapper settings
SELECT dsc.id, dsc.max_mappers, ds.type_id, df.extension
FROM data_source_configuration dsc, data_store ds, dap_job_configuration djc, dap_file df
WHERE dsc.connection_fk = ds.id 
	AND dsc.id = djc.id 
	AND df.id=djc.dap_file__id 
	AND dsc.max_mappers IS NOT NULL;

 

#Get custom hadoop properties from data-stores
SELECT dscp.id, dscp.property_key, dscp.property_value, ds.type_id 
FROM data_store_configuration_property AS dscp 
	INNER JOIN data_store AS ds ON dscp.configuration_fk = ds.id 
WHERE dscp.property_key='hadoop.properties';

 

#Get job-configurations with max-split settings
SELECT * 
FROM dap_job_configuration 
WHERE hadoop_properties LIKE "%das.splitting.max-split-count%";

 

#Get job-configurations of user
SELECT djc.id, df.name, djc.hadoop_properties
FROM dap_job_configuration djc
	INNER JOIN dap_file df ON djc.dap_file__id = df.id
	INNER JOIN permission p ON df.permission_fk = p.id
WHERE p.owner = 'system';

 

#Get hadoop properties from cluster setup
SELECT * 
FROM property 
WHERE name = "hadoop.customPropertiesString";

 

#Custom "Table Name" query for Export Job
SELECT name ExportJobName, table_name DbTableName, create_new_table NewTable, record_replacement ReplaceRecords
FROM data_base_data_sink dbds 
	INNER JOIN dap_job_configuration djc ON dbds.id = djc.id
	INNER JOIN dap_file df ON djc.dap_file__id = df.id
WHERE dbds.id = '<configurationID>';

 

Here, the configuration ID is the job configuration ID of your particular export job.
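
If you don't know that ID yet, it can be looked up by the export job's name along the lines of the joins above (<ExportJobName> is a placeholder):

#Look up the job configuration ID of an export job by name
SELECT djc.id ConfigurationId
FROM dap_job_configuration djc
	INNER JOIN dap_file df ON djc.dap_file__id = df.id
WHERE df.name = '<ExportJobName>' AND df.extension = 'EXPORT_JOB_EXTENSION';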

 

License Volume Queries

#License Consumption By Day
select day, round((sum(volume)/1024)/1024,2) volume_mb
from data_volume_summary
group by day
order by day;
 
#License Consumption By Month
select SUBSTRING(day,1,7) 'month', round(((sum(volume)/1024)/1024)/1024,2) volume_gb
from data_volume_summary
group by SUBSTRING(day,1,7)
order by 1;
 
#License Consumption By Year
select SUBSTRING(day,1,4) 'year', round((((sum(volume)/1024)/1024)/1024)/1024,4) volume_tb
from data_volume_summary
group by SUBSTRING(day,1,4)
order by 1;
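
The overall total across all recorded days can be taken from the same table:

#Total license consumption
select round((((sum(volume)/1024)/1024)/1024)/1024,4) volume_tb
from data_volume_summary;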

Artifact Volume Queries

#List paths of Workbooks and Worksheets with disk usage
/* Aggregated by Workbook name */
SELECT d.uri URI,
df.name,
wc.id ConfigID,
sum(wsd.kept) AS Kept,
count(wsd.kept) AS TotalSheets,
(sum(wsd.kept) / count(wsd.kept)) * 100 AS Percentage,
(sum(wsd.data_statistic__bytes) / 1024 / 1024) AS TotalMBytes
FROM workbook_configuration wc,
workbook_sheet_data wsd,
data d, dap_file df,
dap_job_configuration djc
WHERE d.dap_job_configuration__id = wc.id
AND d.id=wsd.workbook_data_fk
AND d.dap_job_configuration__id = djc.id
AND df.id = djc.dap_file__id
GROUP BY wc.id, d.uri, df.name
ORDER BY TotalMBytes desc,
Percentage desc;

/* Aggregated by Worksheet name */
SELECT wc.id ConfigID,
d.uri URI,
wsd.sheet_name WorksheetName,
wsd.data_statistic__bytes Bytes
FROM workbook_configuration wc, workbook_sheet_data wsd, data d, dap_file df, dap_job_configuration djc
WHERE d.dap_job_configuration__id = wc.id
AND d.id=wsd.workbook_data_fk
AND d.dap_job_configuration__id = djc.id
and df.id = djc.dap_file__id
order by Bytes desc;

Custom "Filter By Partitions" query for Workbooks

#Get a list of Workbooks and "Filter By Partitions"
SELECT sh.workbook_fk ID, sh.name Name, sh.partition_selector PartDate 
FROM sheet sh
WHERE sh.partition_selector LIKE '%TODAY%';

Number of Input Data Objects for Workbooks

SELECT sheet.workbook_fk, count(*) 
FROM data, sheet 
WHERE data.dap_job_configuration__id = sheet.data_source_id 
	AND data.status = 0 
	AND sheet.data_source_id IS NOT NULL
	AND sheet.workbook_fk IN (<Configuration ID>) 
	GROUP BY sheet.workbook_fk;

Most Frequently Executed Workbooks

SELECT configuration_id AS ConfigID,
  SUM(jobs_executed) + SUM(mr_jobs_executed) AS Executions 
FROM daily_workbook_statistics 
  GROUP BY configuration_id 
  ORDER BY Executions DESC 
  LIMIT 10; 

Workbooks With the Most Sheets

SELECT workbook_fk as ConfigID, 
  COUNT(id) AS Sheets 
FROM sheet 
  GROUP BY workbook_fk 
  ORDER BY Sheets DESC
  LIMIT 10;

 

Database Size Query

SELECT TABLE_NAME, table_rows, data_length, index_length, 
round(((data_length + index_length) / 1024 / 1024),2) 'SizeMB'
FROM information_schema.TABLES 
WHERE table_schema = 'dap';
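
The per-table figures can also be summed into the total footprint of the schema:

SELECT round(SUM(data_length + index_length) / 1024 / 1024, 2) 'TotalSizeMB'
FROM information_schema.TABLES 
WHERE table_schema = 'dap';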

Consumed License Volume Per Import Job (since 2.1.4)

select job_conf_id, sum(volume) from data_volume_summary group by job_conf_id;

Consumed License Volume Per User (since 2.1.4)

select permission.owner, sum(data_volume_summary.volume)
from data_volume_summary, dap_job_configuration, dap_file, permission
where data_volume_summary.job_conf_id = dap_job_configuration.id
	and dap_job_configuration.dap_file__id = dap_file.id
	and dap_file.permission_fk = permission.id
group by permission.owner;

List all existing artifacts and group membership

#All artifacts and group membership
SELECT df.id DapFileId, df.name Name, df.extension Extension, df.permission_fk DFPermFk, gp.group_name
FROM dap_file df  
	LEFT OUTER JOIN group_permission gp ON df.permission_fk = gp.permission__id
WHERE df.extension LIKE '%EXTENSION%' AND df.file_mode = 'NORMAL';

Since a data object might exist for each artifact, gather those as well.

#All artifacts with data objects and group membership
SELECT df.id DapFileId, df.name Name, df.extension Extension, df.permission_fk DFPermFk, gp.group_name
FROM dap_file df  
LEFT OUTER JOIN group_permission gp ON df.permission_fk = gp.permission__id
WHERE df.extension LIKE '%EXTENSION%' AND NOT df.extension LIKE '%SYSTEM%';

List ImportJobs and Workbooks configured to keep more than 1 result

SELECT djc.id, df.name, df.extension 
FROM dap_file df, dap_job_configuration djc
WHERE df.extension in ('IMPORT_JOB_EXTENSION','WORKBOOK_EXTENSION')
	AND df.file_mode = 'NORMAL'
	AND djc.dap_file__id = df.id
	AND (djc.min_keep_count is null OR djc.min_keep_count > 1);

List ImportJobs with the number of partitioned files

SELECT df1.name ArtifactName,
       perm.owner,
       djc.id ArtifactId,
       COUNT(d.id) Pieces, 
       df1.extension Type,
       df1.last_executed LastExecuted,
       df1.bytes_processed, 
   SUM(dp.data_statistic__bytes) PartBytes,
   SUM(dp.data_statistic__record_count) PartRecCount, 
   SUM(dp.data_statistic__uncompressed_bytes) PartUncompBytes 
FROM dap_job_configuration djc 
    INNER JOIN dap_file df1 ON djc.dap_file__id = df1.id 
    INNER JOIN data d ON djc.id = d.dap_job_configuration__id 
    INNER JOIN  dap_file df2 ON d.dap_file__id = df2.id 
    INNER JOIN  permission perm ON df1.permission_fk = perm.id 
    LEFT OUTER JOIN data_partition dp ON d.partition_index__id = dp.partition_index__id 
WHERE df1.extension = "IMPORT_JOB_EXTENSION" 
GROUP BY df1.name 
ORDER BY pieces DESC;

List owners and permissions for workbooks

SELECT wc.id WbConfigId, 
    df.id DapFileId, df.name Name, df.extension Extension, df.permission_fk DFPermFk, 
    perm.id PermissionId, perm.owner Owner, perm.other_permission_bits OtherPermissionBit, 
    gp.id GroupPermId, gp.group_name GroupName, gp.permission_bits GroupPerms 
    FROM workbook_configuration wc 
        INNER JOIN dap_job_configuration djc on wc.id = djc.id 
        INNER JOIN dap_file df on djc.dap_file__id = df.id 
        INNER JOIN permission perm on wc.full_data_permission_fk = perm.id 
        LEFT OUTER JOIN group_permission gp ON df.permission_fk = gp.permission__id; 

List all scheduled jobs and their schedules

SELECT conf.`id`, file.`name`, file.`extension`, conf.`pull_type`, conf.`schedule` 
    FROM dap_job_configuration conf 
        JOIN dap_file file ON conf.`dap_file__id` = file.`id` 
    WHERE conf.`pull_type` = 2;

Where the pull_type value has the following meaning: 

0: Manually
1: When new data comes in
2: Scheduled
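
A quick way to see the distribution of trigger types across all job configurations:

SELECT conf.`pull_type`, count(*) Jobs 
    FROM dap_job_configuration conf 
    GROUP BY conf.`pull_type`;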

 

List all Scheduled Jobs with Schedule and Owner

SELECT DISTINCT
job_owners.df_name Name,
CASE job_owners.df_extension
WHEN 'IMPORT_LINK_JOB_EXTENSION' THEN 'Data Link'
WHEN 'IMPORT_JOB_EXTENSION' THEN 'Import Job'
WHEN 'WORKBOOK_EXTENSION' THEN 'Workbook'
WHEN 'EXPORT_JOB_EXTENSION' THEN 'Export Job'
END Type,
job_owners.user_name Owner,
djc.id ConfigID,
job_owners.uuid UUID,
job_owners.file_id FileID,
CASE djc.pull_type
WHEN 1 THEN 'When new data comes in'
WHEN 2 THEN 'Scheduled'
END PullType,
djc.schedule Schedule
FROM dap_job_configuration djc,
(
SELECT df.name df_name, df.extension df_extension, df.uuid, df.id file_id, u.name user_name
FROM dap_file df, permission p, user u
WHERE df.permission_fk = p.id
AND p.owner = u.name
AND u.name <> 'admin'
AND df.file_mode = 'NORMAL'
) job_owners
WHERE djc.dap_file__id = job_owners.file_id
AND djc.pull_type <> 0;

Find scheduled jobs and their executions

SELECT 
  dap_job_configuration__id, 
  start_time, 
  stop_time, 
  triggered_by,
  schedule, 
  dap_file.id AS FileID,
  dap_file.name AS Name,
  dap_file.extension,
  dap_file.last_executed as lastExecuted,
  dap_file.last_changed_date as lastChanged
FROM 
  dap_job_configuration,
  dap_job_execution,
  dap_file 
WHERE 
  dap_job_configuration__id = dap_job_configuration.id 
    AND schedule IS NOT NULL 
    AND dap_file.id = dap_file__id; 

Find scheduled export jobs

select conf.`id`, file.`name`, file.`extension`, conf.`pull_type`, conf.`schedule` 
	from dap_file file 
		join dap_job_configuration conf ON file.`id` = conf.`dap_file__id` 
	where (conf.pull_type = 1 or conf.pull_type = 2) and file.extension = 'EXPORT_JOB_EXTENSION';

Find jobs with no connection to others

SELECT 
  name ArtifactName,
  CASE extension
    WHEN 'DATA_STORE_EXTENSION' THEN 'Connection'
    WHEN 'IMPORT_LINK_JOB_EXTENSION' THEN 'Data Link'
    WHEN 'IMPORT_JOB_EXTENSION' THEN 'Import Job'
    WHEN 'WORKBOOK_EXTENSION' THEN 'Workbook'
    WHEN 'EXPORT_JOB_EXTENSION' THEN 'Export Job'
  END Type,
  last_changed_date lastChanged,
  last_executed lastExecuted,
  id
FROM dap_file 
WHERE extension NOT IN ('SYSTEM_JOB_EXTENSION',
                        'SYSTEM_JOB_DATA_EXTENSION',
                        'JPG', 'PNG',
                        'WORKBOOK_DATA_EXTENSION',
                        'IMPORT_JOB_DATA_EXTENSION',
                        'EXPORT_JOB_DATA_EXTENSION')
  AND id NOT IN (SELECT dependency_file_id FROM file_dependency) 
  AND id NOT IN (SELECT source_file_id FROM file_dependency) 
  AND folder_fk != (SELECT id FROM folder WHERE name = '.system'); 

Find jobs not executed in the last 10 days

SELECT 
    name ArtifactName,
  CASE extension
    WHEN 'IMPORT_LINK_JOB_EXTENSION' THEN 'Data Link'
    WHEN 'IMPORT_JOB_EXTENSION' THEN 'Import Job'
    WHEN 'WORKBOOK_EXTENSION' THEN 'Workbook'
    WHEN 'EXPORT_JOB_EXTENSION' THEN 'Export Job'
  END Type,
  last_changed_date lastChanged,
  last_executed lastExecuted,
  schedule Schedule
FROM 
  dap_file, dap_job_configuration as job_conf 
  WHERE job_conf.dap_file__id=dap_file.id 
    AND schedule IS NOT null 
    AND last_executed <= SUBDATE(CURRENT_DATE(), 10); 

 

List all referencing workbooks that prevent other workbook data from being deleted

If you see that workbook data with ID <xyz> is not being deleted, this is usually because other ACTIVE workbooks reference it. You can find those referencing workbooks with the following query:

SELECT data.id 'Ref By Data ID', data.status 'Status', data.dap_job_configuration__id 'Workbook ID', workbook_sheet_data.id 'Ref By Sheet Data ID', workbook_sheet_data.sheet_name 'Sheet Name' 
    FROM data, workbook_sheet_data, workbook_sheet_external_data 
    WHERE data.id = workbook_sheet_data.workbook_data_fk 
        AND workbook_sheet_data.id = workbook_sheet_data_id 
        AND referenced_workbook_sheet_data_id IN (select workbook_sheet_data.id from workbook_sheet_data, data where workbook_sheet_data.workbook_data_fk = data.id and data.id = <xyz>);

You can now do the following:

  1. Delete the referencing workbook data from the UI (in case it is not needed anymore).
  2. Set the keep flag of the referencing sheet to false so that this doesn't happen in the future (see the sketch below).
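
A minimal sketch of option 2, assuming the keep flag is the sheet.keep column (selected as Keep in the workbook queries above) and is stored as a boolean. Replace <sheet id> with the ID of the referencing sheet, and take a backup before any write to the application database:

#Set the keep flag of a referencing sheet to false (sketch; verify the column type first)
UPDATE sheet SET keep = 0 WHERE id = <sheet id>;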

Files shared to the world

#Sharing for "Others" enabled on Import Jobs, Data Links, File Uploads, Workbooks, and Export Jobs
 
SELECT djc.id,
       df.name Name,
       df.extension Type,
       p.owner Owner,
       p.other_permission_bits Others
FROM dap_file df, 
     dap_job_configuration djc,
     permission p
WHERE p.other_permission_bits != '___' 
  AND df.file_mode = 'NORMAL'
  AND df.permission_fk = p.id 
  AND djc.dap_file__id = df.id;

#Infographics with sharing for "Others" enabled
 
SELECT im.id,
       df.name Name,
       p.owner Owner,
       p.other_permission_bits others_sharing
FROM dap_file df,
     infographic_model im,
     permission p
WHERE p.other_permission_bits != '___'
  AND df.file_mode = 'NORMAL'
  AND df.permission_fk = p.id
  AND im.dap_file__id = df.id;

#Workbooks sharing full data results
 
SELECT wc.id,
       df.name AS WorkbookName,
       p.owner AS Owner,
       p.other_permission_bits AS Permissions
FROM workbook_configuration wc
  INNER JOIN dap_job_configuration djc ON wc.id = djc.id
  INNER JOIN dap_file df ON djc.dap_file__id = df.id
  INNER JOIN permission p ON wc.full_data_permission_fk = p.id
WHERE df.file_mode = 'NORMAL' 
  AND p.other_permission_bits != '___';

#Public Infographics
 
SELECT im.id, 
       df.name Name, 
       p.owner Owner
FROM dap_file df, 
     infographic_model im,
     permission p
WHERE p.shared != 0 
  AND df.file_mode = 'NORMAL' 
  AND df.permission_fk = p.id 
  AND im.dap_file__id = df.id;

Number of bytes for each executed artifact

(Don't rely on the dap_job_execution table, because housekeeping cleans it up every 28 days.)

select df1.name ArtifactName, djc.id ArtifactId, df2.name DataName, d.id DataId,
df1.extension Type, df1.last_executed LastExecuted,
djc.data_volume_size_by_license_period dataVolLicPer, df1.bytes_processed,
sum(dp.data_statistic__bytes) PartBytes, sum(dp.data_statistic__record_count) PartRecCount,
sum(dp.data_statistic__uncompressed_bytes) PartUncompBytes
from dap_job_configuration djc
inner join dap_file df1 on djc.dap_file__id = df1.id
inner join data d on djc.id = d.dap_job_configuration__id
inner join dap_file df2 on d.dap_file__id = df2.id
left outer join data_partition dp on d.partition_index__id = dp.partition_index__id
GROUP BY df2.id
order by djc.id;
# group by df2.id will sum up the partition based on dataId
# group by df1.id will sum up based on artifactId

Workbooks attached to dashboards

Workbook info and last execution details for workbooks attached to widgets on old dashboards. Use this query to identify upgrade risks for customers coming from 1.x, as dashboards are no longer supported for viewing in 4.3+.

select djc.*, df.*, u.name as owner, u.email, u.login_date as lastLoginDate
from dap_job_configuration djc, dap_file df, widget w, sheet s, workbook_configuration wc, permission p, user u
where w.sheet_fk = s.id
and s.workbook_fk = wc.id
and djc.id = wc.id
and df.id = djc.dap_file__id
and p.id = df.permission_fk
and u.name = p.owner;

 

All exports to files and databases and their locations and connections

SELECT exportJob.name as exportjob, f.name as connectionName, dsc.connection_fk,
       coalesce(dscpHDFS.property_value, dscpHive.property_value) databaseOrPath,
       coalesce(file_name, table_name) target 
FROM data_sink_configuration dsc
left outer join file_data_sink fds on fds.id=dsc.id
left outer join data_base_data_sink dbds on dbds.id=dsc.id
inner join data_store ds on ds.id=dsc.connection_fk
left outer join data_store_configuration_property dscpHDFS on dscpHDFS.configuration_fk = ds.id and dscpHDFS.property_key='rootPathPrefix'
left outer join data_store_configuration_property dscpHive on dscpHive.configuration_fk = ds.id and dscpHive.property_key='databaseFilter'
inner join dap_file f on f.id=ds.dap_file__id
inner join dap_job_configuration job on job.id=dsc.id
inner join dap_file exportJob on exportJob.id=job.dap_file__id;

Space Consuming Jobs

SELECT df.name 'Job',
CASE df.extension
WHEN 'IMPORT_LINK_JOB_EXTENSION' THEN 'Data Link'
WHEN 'IMPORT_JOB_EXTENSION' THEN 'Import Job'
WHEN 'WORKBOOK_EXTENSION' THEN 'Workbook'
WHEN 'EXPORT_JOB_EXTENSION' THEN 'Export Job'
END Type,
d.dap_job_configuration__id ConfigID,p.owner Owner,d.uri URI,
round(df.bytes_processed/1024/1024/1024,2) 'Size in GB',
d.effective_date 'Effective Date',d.id 'Data Id'
FROM data d, dap_file df , permission p, dap_job_configuration djc
WHERE
df.extension in ('IMPORT_JOB_EXTENSION','WORKBOOK_EXTENSION','EXPORT_JOB_EXTENSION','IMPORT_LINK_JOB_EXTENSION')
AND djc.dap_file__id = df.id AND djc.id = d.dap_job_configuration__id
AND p.id = df.permission_fk
order by df.bytes_processed desc;

 

List the owner of all artifacts together with config ID, file ID, and UUID, optionally filtered by user (see the commented-out predicate at the end).

SELECT df.name ArtifactName,
CASE df.extension
WHEN 'UPLOAD_JOB_EXTENSION' THEN 'File Upload'
WHEN 'IMPORT_LINK_JOB_EXTENSION' THEN 'Data Link'
WHEN 'IMPORT_JOB_EXTENSION' THEN 'Import Job'
WHEN 'WORKBOOK_EXTENSION' THEN 'Workbook'
WHEN 'EXPORT_JOB_EXTENSION' THEN 'Export Job'
END Type,
perm.owner Owner,
djc.id ConfigID,
djc.dap_file__id FileID,
df.uuid UUID
FROM dap_file df,
dap_job_configuration djc,
permission perm
WHERE df.extension IN ('UPLOAD_JOB_EXTENSION',
'IMPORT_LINK_JOB_EXTENSION',
'IMPORT_JOB_EXTENSION',
'WORKBOOK_EXTENSION',
'EXPORT_JOB_EXTENSION')
AND df.file_mode = 'NORMAL'
AND djc.dap_file__id = df.id
AND perm.id = df.permission_fk
/* AND perm.owner LIKE '%<user>%' */ ;

Get the file name, fileID and UUID for each file in a specific folder

SELECT * FROM
(SELECT concat('/',GetAncestry(df.folder_fk),'/') AS PATH,
df.name ArtifactName,
CASE df.extension
WHEN 'UPLOAD_JOB_EXTENSION' THEN 'File Upload'
WHEN 'IMPORT_LINK_JOB_EXTENSION' THEN 'Data Link'
WHEN 'IMPORT_JOB_EXTENSION' THEN 'Import Job'
WHEN 'WORKBOOK_EXTENSION' THEN 'Workbook'
WHEN 'EXPORT_JOB_EXTENSION' THEN 'Export Job'
END Type,
p.owner AS Owner,
p.id AS PermissionID,
djc.id AS ConfigID,
djc.dap_file__id AS FileID,
df.uuid AS UUID
FROM permission p,
dap_job_configuration djc,
dap_file df
WHERE p.id = df.permission_fk
AND df.id = djc.dap_file__id) A
WHERE lower(A.PATH) LIKE '/%' /* narrow to a specific folder, e.g. '/marketing%' */
ORDER BY A.OWNER, A.PATH;
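
The GetAncestry routine used above is a stored function in the Datameer database that walks the parent-child folder relationships and builds the location path of an artifact. Below is a minimal sketch of such a function, assuming the folder table links to its parent through a parent_fk column (an assumed name; the shipped routine may differ):

DELIMITER //
CREATE FUNCTION GetAncestry(p_folder_id BIGINT) RETURNS VARCHAR(1024)
    READS SQL DATA
BEGIN
    /* Walk up the folder tree, prepending each folder name to the path.
       parent_fk is an assumed column name. */
    DECLARE v_path VARCHAR(1024) DEFAULT '';
    DECLARE v_name VARCHAR(255);
    DECLARE v_parent BIGINT;
    WHILE p_folder_id IS NOT NULL DO
        SELECT name, parent_fk INTO v_name, v_parent FROM folder WHERE id = p_folder_id;
        SET v_path = IF(v_path = '', v_name, CONCAT(v_name, '/', v_path));
        SET p_folder_id = v_parent;
    END WHILE;
    RETURN v_path;
END//
DELIMITER ;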

SQL statement that delivers output similar to ls -Ral *.wbk

SELECT
concat('/',GetAncestry(df.folder_fk),'/') AS Path,
df.name Name,
CASE df.extension
WHEN 'WORKBOOK_EXTENSION' THEN 'Workbook'
END Type,
perm.owner Owner,
gp.group_name "Group",
'RWX' AS Perms,
gp.permission_bits GroupPerms,
perm.other_permission_bits Others,
wc.id ConfigID,
df.id FileID,
df.uuid UUID,
df.bytes_processed Bytes,
df.last_changed_date Changed,
df.last_executed Executed
FROM workbook_configuration wc
INNER JOIN dap_job_configuration djc ON wc.id = djc.id
INNER JOIN dap_file df ON djc.dap_file__id = df.id
/*
As of v6.1 wc.data_permission_fk
*/
INNER JOIN permission perm ON wc.full_data_permission_fk = perm.id
LEFT OUTER JOIN group_permission gp ON df.permission_fk = gp.permission__id;

Top artifacts using data volume in the current term (since <timestamp>)

SELECT t1.job_conf_id AS ConfigID,
t1.GB,
df.name ArtifactName,
CASE df.extension
WHEN 'UPLOAD_JOB_EXTENSION' THEN 'File Upload'
WHEN 'IMPORT_LINK_JOB_EXTENSION' THEN 'Data Link'
WHEN 'IMPORT_JOB_EXTENSION' THEN 'Import Job'
WHEN 'WORKBOOK_EXTENSION' THEN 'Workbook'
WHEN 'EXPORT_JOB_EXTENSION' THEN 'Export Job'
END Type
FROM (
SELECT job_conf_id,
round(sum(volume)/1024/1024/1024,2) AS GB
FROM data_volume_summary /* WHERE day >= "1970-01-21 00:00:00" */
GROUP BY job_conf_id
ORDER BY GB DESC /* LIMIT 20 */
)
t1
JOIN dap_job_configuration djc ON djc.id = t1.job_conf_id
JOIN dap_file df ON df.id = djc.dap_file__id;