Source code for turbo_seti.find_event.find_event

#!/usr/bin/env python
r"""
Backend script to find drifting, narrowband events in a generalized cadence of
radio SETI observations (any number of ons, any number of offs, any pattern -
streamlined for alternating on-off sequences).

The main function contained in this file is :func:`find_events` uses the other
helper functions in this file (described below) to read a list of turboSETI .dat
files. It then finds events within this group of files.
"""

import time
import re
import pandas as pd
import numpy as np

pd.options.mode.chained_assignment = None
#^To remove pandas warnings: default='warn'


[docs]def end_search(t0):
    r"""
    Ends the search when there are no candidates left, or when the filter
    level matches the user-specified level.

    Parameters
    ----------
    t0 : time

    """
    #Report elapsed search time
    t1 = time.time()
    print('find_events: Elapsed time: %.2f sec' % ((t1-t0)))


[docs]def read_dat(filename):
    r"""
    Read a turboseti .dat file.

    Parameters
    ----------
    filename : str
        Name of .dat file to open.

    Returns
    -------
    df_data : dict
        Pandas dataframe of hits.

    """
    file_dat = open(filename.strip())
    hits = file_dat.readlines()

    # Get info from the .dat file header
    FileID = hits[1].strip().split(':')[-1].strip()
    Source = hits[3].strip().split(':')[-1].strip()

    MJD = hits[4].strip().split('\t')[0].split(':')[-1].strip()
    RA = hits[4].strip().split('\t')[1].split(':')[-1].strip()
    DEC = hits[4].strip().split('\t')[2].split(':')[-1].strip()

    DELTAT = hits[5].strip().split('\t')[0].split(':')[-1].strip()  # s
    DELTAF = hits[5].strip().split('\t')[1].split(':')[-1].strip()  # Hz

    # As of version 2.1.0, there are 2 additional parameters:
    try:
        max_drift_rate = hits[5].strip().split('\t')[2].split(':')[-1].strip() # Hz/s
        obs_length = hits[5].strip().split('\t')[3].split(':')[-1].strip() # s
    except Exception as exc:
        msg = '*** Out-of-date DAT file format detected in {}.  Rerun doppler search!' \
              .format(filename)
        raise RuntimeError(msg) from exc

    # Get info from individual hits (the body of the .dat file)
    all_hits = []
    for hit_line in hits[9:]:
        hit_fields = re.split(r'\s+', re.sub(r'[\t]', ' ', hit_line).strip())
        all_hits.append(hit_fields)

    # Now reorganize that info to be grouped by column (parameter)
    # not row (individual hit)
    if all_hits:
        TopHitNum = list(zip(*all_hits))[0]
        DriftRate = [float(df) for df in list(zip(*all_hits))[1]]
        SNR = [float(ss) for ss in list(zip(*all_hits))[2]]
        Freq = [float(ff) for ff in list(zip(*all_hits))[3]]
        ChanIndx = list(zip(*all_hits))[5]
        FreqStart = list(zip(*all_hits))[6]
        FreqEnd = list(zip(*all_hits))[7]
        CoarseChanNum = list(zip(*all_hits))[10]
        FullNumHitsInRange = list(zip(*all_hits))[11]

        data = {'TopHitNum': TopHitNum,
                'DriftRate': DriftRate,
                'SNR': SNR,
                'Freq': Freq,
                'ChanIndx': ChanIndx,
                'FreqStart': FreqStart,
                'FreqEnd': FreqEnd,
                'CoarseChanNum': CoarseChanNum,
                'FullNumHitsInRange': FullNumHitsInRange
                }

        # Creating pandas dataframe from data we just read in
        df_data = pd.DataFrame(data)
        df_data = df_data.apply(pd.to_numeric)

    else:
        df_data = pd.DataFrame()

    # Matching column information from before to the .dat data we read in
    df_data['FileID'] = FileID
    df_data['Source'] = Source.upper()
    df_data['MJD'] = MJD
    df_data['RA'] = RA
    df_data['DEC'] = DEC
    df_data['DELTAT'] = DELTAT
    df_data['DELTAF'] = DELTAF

    # As of 2.1.0, 2 new dataframe columns:
    df_data['max_drift_rate'] = max_drift_rate
    df_data['obs_length'] = obs_length

    # Adding extra columns that will be filled out by this program
    df_data['Hit_ID'] = ''
    df_data['status'] = ''
    df_data['in_n_ons'] = ''
    df_data['RFI_in_range'] = ''

    return df_data


[docs]def calc_freq_range(hit, delta_t=0.0, max_dr=True, follow=False):
    r"""
    Calculates a range of frequencies where RFI in an off-source could
    be related to a hit in an on-source, given a freq and drift_rate.

    Parameters
    ----------
    hit : dict
    delta_t : float, optional
    max_dr : bool, optional
    follow : bool, optional

    Returns
    -------
    [low_bound, high_bound] : list

    """
    if max_dr:
        drift_rate = float(hit['max_drift_rate'])
    else:
        drift_rate = float(hit['DriftRate'])

    if drift_rate == 0.0:
        drift_rate = float(hit['DELTAF'])/float(hit['DELTAT'])
    if follow:
        freq = float(hit['Freq']) + drift_rate*(delta_t)/1e6
        delta_t = 2.0 * float(hit['obs_length'])  # NOTE: I'm doing twice the amount in order to widen the range a bit.
    else:
        freq = float(hit['Freq'])
        delta_t = delta_t + float(hit['obs_length'])  #adding to the end of the obs.

    low_bound  = freq - abs(drift_rate)*delta_t/1e6
    high_bound = freq + abs(drift_rate)*delta_t/1e6

    return [low_bound,high_bound]


[docs]def follow_event(hit, on_table, get_count=True):
    r"""
    Follows a given hit to the next observation of the same target and
    looks for hits which could be part of the same event.

    Parameters
    ----------
    hit : dict
    on_table : dict
    get_count : bool

    Returns
    -------
    new_on_table or count : dict or int

    """

    #uses calc_freq_range to see how much the hit *should* have drifted by
    freq_range = calc_freq_range(hit, delta_t=on_table['delta_t'].values[0], max_dr=False, follow=True)

    #looks at the on (next given observation) to see if there are any
    #hits that could plausibly be related to the first one
    new_on_table = on_table[(on_table['Freq']>freq_range[0]) & (on_table['Freq']<freq_range[1])]

    #we only care if there were or were not plausible hits, so turn output into
    #a binary variable with get_count=True
    if get_count:
        n_hits_in_range = len(new_on_table)

        #Avoiding cases where multiple hits in one obs, and none in the other.
        if n_hits_in_range:
            return 1
        return 0

    return new_on_table


[docs]def not_yet_seen(mylist, argument):
    """
    Search a list to see if argument is already there.

    Parameters
    ----------
    mylist : list
        List of things that have been already seen.
    argument : int
        An integer to add to list if not alreay seen.

    Returns
    -------
    bool
       True :: Not yet seen so the argument was added.
       False :: Already seen.
    """
    for ii in mylist:
        if ii == argument:
            return False

    mylist.insert(0, argument)
    return True



[docs]def find_events(dat_file_list, check_zero_drift=False, filter_threshold=3,
                on_off_first='ON', complex_cadence=False,
                SNR_cut=None, min_drift_rate=None, max_drift_rate=None):
    r"""
    Reads a list of turboSETI .dat files.

    Parameters
    ----------
    dat_file_list : list
        A Python list of .dat files with ON observations of a
        single target alternating with OFF observations. This
        cadence can be of any length, given that the ON source
        is every other file. This includes Breakthrough Listen
        standard ABACAD as well as OFF first cadences like
        BACADA. Minimum cadence length is 2, maximum cadence
        length is unspecified (currently tested up to 6).
    check_zero_drift : bool, optional
        A True/False flag that tells the program whether to
        include hits that have a drift rate of 0 Hz/s. Earth-
        based RFI tends to have no drift rate, while signals
        from the sky are expected to have non-zero drift rates.
        Default is False.
    filter_threshold : int, default is 3
        Specification for how strict the hit filtering will be.
        There are 3 different levels of filtering, specified by
        the integers 1, 2, and 3. 
        * Filter_threshold = 1 applies the following parameter checks:
            check_zero_drift
            SNR_cut
            min_drift_rate
            max_drift_rate
        However, Filter_threshold = 1 applies no ON-OFF check.
        * Filter_threshold = 2 returns hits that passed level 1
        AND that are in at least one ON table but no OFF tables.
        * Filter_threshold = 3 returns events that passed level 2
        AND that are present in *ALL* ON tables.
    on_off_first : str {'ON', 'OFF}, optional
        Tells the code whether the .dat sequence starts with
        the ON or the OFF observation. Valid entries are 'ON'
        and 'OFF' only.
    complex_cadence : bool, optional
        A Python list of 1s and 0s corresponding to which
        files in the file_sublist are on-sources and which are
        off_sources for complex (i.e. non alternating) cadences.
    SNR_cut : None (default value) or float value > 0
        If None, then all SNR values from the dedoppler results in the dat
        files are accepted as-is.
        Otherwise, the specified value is the threshold SNR below which
        hits will be discarded.
    min_drift_rate : None (default value) or float value > 0
        If None, then all drift rate values from the dedoppler results in the dat
        files are accepted as-is.
        Otherwise, the specified value is the threshold drift rate below which
        hits will be discarded.
    max_drift_rate : None (default value) or float value > 0
        If None, then all drift rate values from the dedoppler results in the dat
        files are accepted as-is.
        Otherwise, the specified value is the threshold drift rate above which
        hits will be discarded.
 
    Examples
    --------
    It is highly recommended that users interact with this program via the
    front-facing find_event_pipeline.py script. See the usage of that file in
    its own documentation.

    If you would like to run find_events without calling
    find_event_pipeline.py, the usage is as follows:

    >>> find_event.find_events(file_sublist, SNR_cut=10, check_zero_drift=False,
    ...                        filter_threshold=3, on_off_first='ON', complex_cadence=False)

    Notes
    -----
    It calls other functions to find events within this group of files.
    Filter_threshold allows the return of a table of events with hits at
    different levels of filtering.
    Filter_threshold = [1,2,3] means:
        1) Hits above an SNR cut witout AB check
        2) Hits that are only in some As and no Bs
        3) Hits that are only in all As and no Bs

    """
    #Initializing timer
    t0 = time.time()

    print(f"find_events: SNR_cut = {SNR_cut}")
    print(f"find_events: min_drift_rate = {min_drift_rate}")
    print(f"find_events: max_drift_rate = {max_drift_rate}")
    print("find_events: Loading data...")

    #Preparing to read in the list of files
    on_table_list = []
    off_table_list = []

    off_count = 1
    on_count = 1

    odd_even_indicator = 0
    on_off_indicator = 0
    if complex_cadence: # One source name was provided.
        number_of_ons = 0
    else: # complex_cadence=False
        number_of_ons = int(np.floor(len(dat_file_list) / 2.0))
        if on_off_first == 'ON':
            odd_even_indicator = 1
            number_of_ons = int(np.ceil(len(dat_file_list) / 2.0))

    #reading in the list of files
    for ii, dat_file in enumerate(dat_file_list):
        if complex_cadence:
            on_off_indicator = int(complex_cadence[ii])
            number_of_ons += on_off_indicator
        #Checking if the file is an on or off observation
        #if off
        if (ii%2 == odd_even_indicator and complex_cadence == False) \
        or (on_off_indicator == 0 and complex_cadence != False):
            #Using read_dat function to read the .dat file
            #and create the pandas hit table for OFF sources
            off_table_ii = read_dat(dat_file)
            nhits = len(off_table_ii)
            print('find_events: Loaded %i hits from %s (OFF)'%(nhits, dat_file))
            if nhits > 0:
                off_table_ii['status'] = 'off_table_{}'.format(off_count)
                #Grouping all of the off hits into one table
                off_table_list.append(off_table_ii)
                off_count += 1

        else:
            #Using read_dat function to read the .dat file
            #and create the pandas hit table for ON sources
            on_table_ii = read_dat(dat_file)
            nhits = len(on_table_ii)
            print('find_events: Loaded %i hits from %s (ON)'%(nhits, dat_file))
            if nhits > 0:
                on_table_ii['status'] = 'on_table_{}'.format(on_count)
                #Grouping all of the on hits into one table
                on_table_list.append(on_table_ii)
                on_count += 1

    #If there are no hits on any on target, return to caller
    if len(on_table_list) < 1:
        print('*** find_events: There are no hits in this cadence (on_table_list) :(')
        end_search(t0)
        return None

    #Concatenating the on and off tables into a giant on table
    #and a giant off table
    on_table = pd.concat(on_table_list,ignore_index=True)
    if len(off_table_list) > 0:
        off_table = pd.concat(off_table_list,ignore_index=True)
    else: # nil list of OFF tables
        off_table = []

    #Check that all targets in the on_table come from the same source
    #Fix issue where some sources have B'' format
    uniqlist = list(on_table['Source'].unique())
    for i, ele in enumerate(uniqlist):
        try:
            uniqlist[i] = ele.strip("'B")
        except:
            continue
    if len(set(uniqlist)) > 1:
        raise ValueError('find_events: There are multiple sources in the ON table.'
                         'Please check your input files, '
                         'on_off_first parameter,'
                         'or complex_cadence.')

    #Obtain the start times for each hit in the first on table
    ref_time = float(on_table[on_table['status'] == 'on_table_1']['MJD'].unique()[0])

    #Calculating and saving delta_t, in seconds, to follow a given hit from
    #the first ON table to see if it appears in subsequent ON tables
    on_table['delta_t'] = on_table['MJD'].apply(lambda x:
        (float(x) - ref_time)*3600*24)

    #######################################################################
    print('find_events: All data loaded!')
    print()
    print('find_events: Finding events in this cadence...')
    #######################################################################
    # Using logic statements and pandas capabilities to find events that
    # meet these requirements:
    # * If zero drift rate checking is requested, then filter out hits with
    #      a drift rate exactly = 0.0.  Obsolete parameter?
    # * SNR value > SNR cut.
    # * Drift rate value > min_drift_rate.
    # * Drift rate value < max_drift_rate.
    # * Passed level 1 AND that are in at least one ON but no OFFs.
    # * Passed level 2 AND that are present in *ALL* ONs.

    #Optionally remove signals that don't have a drift rate
    if check_zero_drift:
        zero_adjusted_table = on_table
    else:
        zero_adjusted_table = on_table[on_table['DriftRate'] != 0.0]

    # If SNR_cut is None, include all signals from the zero adjusted table.
    # Else, include only signals with SNRs above the threshold value (SNR_cut).
    if SNR_cut is None:
        filter_adjusted_table = zero_adjusted_table
    else:
        filter_adjusted_table = zero_adjusted_table[ abs(zero_adjusted_table['SNR']) > abs(float(SNR_cut)) ]

    # If min_drift_rate is None, include all signals from the filter_adjusted_table.
    # Else, include only signals with a drift_rate above min_drift_rate.
    if min_drift_rate is not None:
        filter_adjusted_table = filter_adjusted_table[abs(filter_adjusted_table['DriftRate']) > abs(float(min_drift_rate))]

    # If max_drift_rate is None, include all signals from the filter_adjusted_table.
    # Else, include only signals with a drift_rate below max_drift_rate.
    if max_drift_rate is not None:
        filter_adjusted_table = filter_adjusted_table[abs(filter_adjusted_table['DriftRate']) < abs(float(max_drift_rate))]

    #If there are no hits left once SNR adjustments are made,
    #let the user know, return the zero-drift adjusted table, and end the
    #search
    if len(filter_adjusted_table) == 0:
        print('*** find_events: The filter_adjusted_table is empty.  No hits. :(')
        end_search(t0)
        return None
    if filter_threshold == 1:
        print('find_events (Filter threshold 1): Found a total of %i hits in this cadence!'%len(filter_adjusted_table))
        end_search(t0)
        return filter_adjusted_table
    print('find_events (Filter threshold 2/3): Found a total of %i hits in this cadence!'%len(filter_adjusted_table))

    #----------------------------------------------------------------------

    #Now find how much RFI is within a frequency range of the hit
    #by comparing the ON to the OFF observations. Update RFI_in_range
    if len(off_table) == 0:
        print('find_events: Length of off_table = 0')
        filter_adjusted_table['RFI_in_range'] = 0
    else:
        filter_adjusted_table['RFI_in_range'] = filter_adjusted_table.apply(
            lambda hit:
                len(off_table[((off_table['Freq'] > calc_freq_range(hit)[0])
                               & (off_table['Freq'] < calc_freq_range(hit)[1])
                               )]), axis=1)

    #If there is no RFI in range of the hit, it graduates to the
    #not_in_B_table
    not_in_off_table = filter_adjusted_table[filter_adjusted_table['RFI_in_range'] == 0]

    if len(not_in_off_table) == 0:
        print('find_events (Filter threshold 2/3): Found no hits present in only the on observations for this cadence :(')
        end_search(t0)
        return None
    if filter_threshold == 2:
        print('find_events (Filter threshold 2): Found a total of %i hits in only the on observations for this cadence!'%len(not_in_off_table))
        end_search(t0)
        return not_in_off_table

    print('find_events: Found a total of %i hits in only the on observations for this cadence!'%len(not_in_off_table))

    #----------------------------------------------------------------------

    on_but_not_off_table_list = []
    in_all_ons_table = []
    #Follow the drifting of a hit to find the events that
    #are present in ALL on observations

    for i in range(1, number_of_ons + 1):
        on_but_not_off_table_list.append(not_in_off_table[not_in_off_table['status'] == 'on_table_{}'.format(i)])
    empty_counter = 0
    for hit_list in on_but_not_off_table_list:
        if hit_list.empty is True:
            empty_counter += 1
    if empty_counter == 0:
        first_on = on_but_not_off_table_list[0]


        def hit_func(hit):
            """
            Embedded hit_func()
            Args:
              hit:

            Returns:
                counter, int
            """
            counter = 0
            for ii in range(1, len(on_but_not_off_table_list)):
                counter += follow_event(hit, on_but_not_off_table_list[ii])
            return counter


        first_on['in_n_ons'] = first_on.apply(hit_func, axis=1)
        in_all_ons_table = first_on[first_on['in_n_ons'] == number_of_ons - 1]

        #Create list of events.
        filter_3_event_list = []

        # Create an event table of unique entries.
        singularities = []
        for hit_index, hit in in_all_ons_table.iterrows():
            for table in on_but_not_off_table_list:
                temp_table = follow_event(hit, table, get_count=False)
                temp_table['Hit_ID'] = hit['Source']+'_'+str(hit_index)
                if not_yet_seen(singularities, temp_table.iloc[0]['TopHitNum']):
                    filter_3_event_list += [temp_table]

    else:
        print('*** find_events (Filter threshold 3): At least one of the ON tables is empty :(')
        end_search(t0)
        return None

    if len(in_all_ons_table) > 0:
        best_events_table = pd.concat(filter_3_event_list)
        print('find_events: Found a total of {} events across this cadence!'.format(len(best_events_table)))
        end_search(t0)
        return best_events_table

    print('*** find_events (Filter threshold 3): Found no events across this cadence (in_all_ons_table empty)) :(')
    end_search(t0)
    return None