ingestion

class virtualitics_sdk.nlp.ingestion.DataDiagnostic(data_upload_step, pipeline_config_step, data_filtering_step, advanced_dash=False, downsample_data=False, num_samples=10000, sample_percentage=0.5)

Bases: Step

build_advanced_dashboard(nlp_stats_df, store_interface, nlp_module, current_page)
build_simple_dashboard(nlp_stats_df, store_interface, nlp_module, current_page)
default_narrative = 'joint_narrative_f'
static filter_by_date(pandas_df, dt_col, range_vals)
get_dependencies(store_interface)
logger = <Logger DataDiagnostic (INFO)>
main_section = 'Data Diagnostic'
merge_narrative_features(input_df, cols, drop=False)
run(flow_metadata)

class virtualitics_sdk.nlp.ingestion.DateTimeFiltering(pipeline_config_step, data_upload_step)

Bases: Step

static cast_column(df_column, date_format)
Return type: Series

dt_range = 'Document Time Window'
get_dependencies(store_interface)
logger = <Logger DateTimeFiltering (INFO)>
static produce_plot_image(x, y, img_title='', img_descr='')
run(flow_metadata)

virtualitics_sdk.nlp.ingestion.create_average_dep_depth_hist(nlp_stats_df)
virtualitics_sdk.nlp.ingestion.doc_len_infographic(nlp_stats)
Return type: Tuple[InfographData, InfographData]

virtualitics_sdk.nlp.ingestion.empty_docs_infographic(nlp_stats)
Return type: InfographData

virtualitics_sdk.nlp.ingestion.events_entities_infographics(nlp_stats)
Return type: List[InfographData]

virtualitics_sdk.nlp.ingestion.generate_pos_summary_df(nlp_stats_df)
virtualitics_sdk.nlp.ingestion.mean_dependency_tree_depth(nlp_stats)
Return type: InfographData

virtualitics_sdk.nlp.ingestion.mean_sents_info(nlp_stats, threshold=2.5)
Return type: InfographData

virtualitics_sdk.nlp.ingestion.mean_unique_words(nlp_stats)
Return type: InfographData

virtualitics_sdk.nlp.ingestion.sent_len_infographic(nlp_stats)
Return type: Tuple[InfographData, InfographData]

virtualitics_sdk.nlp.ingestion.unique_ents_features_in_corpus(ents_table)
virtualitics_sdk.nlp.ingestion.unique_events_features_in_corpus(events_table)